def get_tfidf_vectors(event_name, eventgroup_id, session):
    logging.info("loading documents from DB")
    documents = events.get_documents_from_event(event_name, session)
    # keep only the Document objects (first element of each result row)
    documents = documents[:, 0]

    logging.info("generating tfidf vectors")
    tfidf = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    X = tfidf.fit_transform([doc.text for doc in documents])

    return X.todense(), tfidf, [doc.id for doc in documents]
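
A minimal usage sketch for get_tfidf_vectors; the engine, session and event name below are placeholder assumptions borrowed from the other examples in this file, and eventgroup_id is passed as None because the function above does not use it.

# Hypothetical usage of get_tfidf_vectors (illustrative only).
from sqlalchemy.orm import sessionmaker

from db.engines import engine_lmartine as engine

Session = sessionmaker(engine, autocommit=True)
session = Session()

vectors, vectorizer, doc_ids = get_tfidf_vectors('oscar_pistorius', eventgroup_id=None, session=session)
print(vectors.shape)  # (n_documents, n_terms)
print(len(doc_ids))   # one document id per row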
Example #2
def gen_fasttext_vectors(event_name, session):
    f = Path(f'data/fasttext_vectors_event_{event_name}.npy')
    if f.is_file():
        logging.info(f"File exists: data/fasttext_vectors_event_{event_name}.npy")
        return

    # load the English model with parser, tagger and NER disabled (legacy spaCy API), keeping only the tokenizer
    nlp = spacy.load('en', parser=False, tagger=False, entity=False)

    documents = get_documents_from_event(event_name, session)

    path = '/home/mquezada/phd/multimedia-summarization/data/word_embeddings/ft_alltweets_model.vec'
    w2v = KeyedVectors.load_word2vec_format(path)

    doc_stream = nlp.pipe([doc.text for doc in documents[:, 0]], n_threads=16)
    doc_vectors = np.empty((len(documents), w2v.vector_size))

    for i, doc in tqdm(enumerate(doc_stream), total=len(documents)):
        doc_vector = []

        # merge hashtags into single tokens so they match the word-embedding format
        indexes = [m.span() for m in re.finditer(r'#\w+', doc.text, flags=re.IGNORECASE)]
        for start, end in indexes:
            doc.merge(start_idx=start, end_idx=end)

        for token in doc:
            if token.pos_ == "PUNCT" or \
                    token.is_punct or \
                    token.is_space or \
                    token.text.startswith('@') or \
                    token.like_url:
                continue

            if token.lower_ in w2v:
                doc_vector.append(w2v[token.lower_])

        # representative vector is the average of all word vectors (NaN if no token was in the vocabulary)
        vector = np.mean(doc_vector, axis=0)[None]

        doc_vectors[i] = vector

    np.save(f'data/fasttext_vectors_event_{event_name}.npy', arr=doc_vectors)
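
A hedged sketch of loading the saved vectors back; the file name mirrors the one written by gen_fasttext_vectors and the event name is only an example.

# Illustrative load of the per-document fastText vectors saved above.
import numpy as np

event_name = 'oscar_pistorius'  # example event name
doc_vectors = np.load(f'data/fasttext_vectors_event_{event_name}.npy')
print(doc_vectors.shape)  # (n_documents, embedding_dimension)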
Example #3
logging.basicConfig(
    format='%(asctime)s | %(name)s | %(levelname)s : %(message)s',
    level=logging.INFO)
# tokenizer = Tokenizer()

number_summaries = 5

# server, engine = connect_from_rafike(username='******', password='******')
def connect():
    return connect_to_server(username="******",
                             host="172.17.69.88",
                             ssh_pkey="/home/luis/.ssh/id_rsa")
with connect() as engine:
    Session = sessionmaker(engine, autocommit=True)
    session = Session()

    # documents = session.query(Document).all()
    documents = events.get_documents_from_event(event_name, session)
    results_dir = Path('results', 'results_more_tweets')

    html_template = '''<blockquote class="twitter-tweet" data-lang="en">
      <a href="https://twitter.com/jack/status/%s"></a>
    </blockquote>'''
    script = '<script async src="http://platform.twitter.com/widgets.js" charset="utf-8"></script>'

    def cmp(d: Document):
        # return d.total_favs + d.total_replies + d.total_rts + d.total_tweets
        return d.total_rts

    clusters = session.query(Cluster).all()

    for i, cluster in tqdm(enumerate(clusters), total=len(clusters)):
        document_cluster = session.query(Document, DocumentCluster) \

Example #4
import re
from pathlib import Path

from sqlalchemy.orm import sessionmaker

import settings
from db.engines import engine_lmartine as engine
from db.events import get_documents_from_event

"""
Exports all the documents of a event to a csv file.
Considers only the text of the documents
"""

event = "oscar_pistorius"

Session = sessionmaker(engine, autocommit=True)
session = Session()
documents = get_documents_from_event(event, session)
print('Number of Docs: {}'.format(len(documents)))
path_csv = Path(settings.LOCAL_DATA_DIR_2, 'data', event, f'documents_{event}.csv')
with path_csv.open('w') as csv_file:
    docs_text = [doc[0].text.replace('\t', '').replace('\n', '') + '\n' for doc in
                 documents]
    #docs_unique = list(set(docs_text))
    #print(len(docs_unique))
    csv_file.writelines(docs_text)
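
For completeness, a small illustrative read-back of the exported file; it assumes the same settings module and event name as the script above.

# Illustrative read-back of the exported documents, one document per line.
from pathlib import Path

import settings

event = "oscar_pistorius"
path_csv = Path(settings.LOCAL_DATA_DIR_2, 'data', event, f'documents_{event}.csv')
with path_csv.open() as csv_file:
    texts = csv_file.read().splitlines()
print('Read back {} documents'.format(len(texts)))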
Example #5
def gen_discourse(event_name, a, session):
    # 'a' is the smoothing parameter: each word vector below is weighted by a / (a + word_frequency)
    f = Path(f'data/discourse_vectors_event_{event_name}_a_{a}.npy')
    if f.is_file():
        logging.info(
            f"File exists: data/discourse_vectors_event_{event_name}_a_{a}.npy"
        )
        return

    # load the English model with parser, tagger and NER disabled (legacy spaCy API), keeping only the tokenizer
    nlp = spacy.load('en', parser=False, tagger=False, entity=False)

    documents = get_documents_from_event(event_name, session)

    path = '/home/mquezada/phd/multimedia-summarization/data/word_embeddings/ft_alltweets_model.vec'
    w2v = KeyedVectors.load_word2vec_format(path)

    freq_path = '/home/mquezada/phd/multimedia-summarization/data/word_embeddings/wordfrequencies_relative.tsv'
    freqs = dict()

    pca = PCA(n_components=1)

    with open(freq_path) as freq_file:
        for line in freq_file:
            word, freq = line.split()
            freqs[word] = float(freq)

    doc_stream = nlp.pipe([doc.text for doc in documents[:, 0]], n_threads=16)
    vs = np.empty((len(documents), w2v.vector_size))

    for i, doc in tqdm(enumerate(doc_stream),
                       total=len(documents),
                       desc="creating vectors"):
        doc_vector = []

        # merge hashtags into single tokens so they match the word-embedding format
        indexes = [
            m.span()
            for m in re.finditer(r'#\w+', doc.text, flags=re.IGNORECASE)
        ]
        for start, end in indexes:
            doc.merge(start_idx=start, end_idx=end)

        for token in doc:
            if token.pos_ == "PUNCT" or \
                    token.is_punct or \
                    token.is_space or \
                    token.text.startswith('@') or \
                    token.like_url:
                continue

            if token.lower_ in w2v:
                w = token.lower_
                vw = w2v[w]
                # fall back to zero frequency if the word is missing from the frequency table
                pw = freqs.get(w, 0.0)

                doc_vector.append(a / (a + pw) * vw)

        # representative vector is the average of the weighted word vectors
        vector = np.mean(doc_vector, axis=0)[None]

        vs[i] = vector

    # all indices
    idx = list(range(len(vs)))
    remove_idx = np.where(np.isnan(vs).any(axis=1))[0]

    final_indices = np.array([i for i in idx if i not in remove_idx])
    vs = np.array([vs[i] for i in idx if i not in remove_idx])

    logging.info("fitting pca")
    pca.fit(vs)
    u = pca.components_

    # remove each vector's projection onto the first principal component
    for i in trange(vs.shape[0], desc="moving vectors"):
        vs[i] = vs[i] - (u.T.dot(u)).dot(vs[i])

    np.save(f'data/discourse_vectors_event_{event_name}_a_{a}.npy', arr=vs)
    np.save(f'data/discourse_vectors_indices_{event_name}_a_{a}.npy',
            arr=final_indices)
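
The weighting and common-component removal used by gen_discourse can be illustrated in isolation; the toy word vectors, frequencies and the value of a below are made up, only the arithmetic mirrors the function above.

# Toy illustration of the a / (a + p(w)) weighting and first-principal-component removal.
import numpy as np
from sklearn.decomposition import PCA

a = 1e-3
word_vectors = {'goal': np.array([1.0, 0.0]), 'match': np.array([0.0, 1.0])}  # made-up 2-d vectors
word_freqs = {'goal': 0.001, 'match': 0.002}                                  # made-up relative frequencies

# document vector: average of the weighted word vectors
doc_vector = np.mean([a / (a + word_freqs[w]) * v for w, v in word_vectors.items()], axis=0)

# with several documents stacked into a matrix, each row's projection onto the first principal component is removed
vs = np.vstack([doc_vector, doc_vector[::-1]])
u = PCA(n_components=1).fit(vs).components_
vs = vs - vs.dot(u.T).dot(u)
print(vs)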