def get_tfidf_vectors(event_name, eventgroup_id, session):
    logging.info("loading documents from DB")
    documents = events.get_documents_from_event(event_name, session)
    documents = documents[:, 0]

    logging.info("generating tfidf vectors")
    tfidf = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    X = tfidf.fit_transform([doc.text for doc in documents])

    return X.todense(), tfidf, [doc.id for doc in documents]
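# Usage sketch (illustrative, not part of the original pipeline): cluster the dense
# TF-IDF matrix with scikit-learn's KMeans. The cluster count is a placeholder,
# `session` is assumed to be created with the project's sessionmaker setup, and
# eventgroup_id is passed as None since get_tfidf_vectors does not use it as written.
def example_cluster_tfidf(event_name, session, n_clusters=20):
    from sklearn.cluster import KMeans
    import numpy as np

    X, tfidf, doc_ids = get_tfidf_vectors(event_name, eventgroup_id=None, session=session)
    labels = KMeans(n_clusters=n_clusters, random_state=0).fit_predict(np.asarray(X))
    # map each document id to its cluster label
    return dict(zip(doc_ids, labels))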
def gen_fasttext_vectors(event_name, session):
    f = Path(f'data/fasttext_vectors_event_{event_name}.npy')
    if f.is_file():
        logging.info(f"File exists: data/fasttext_vectors_event_{event_name}.npy")
        return

    nlp = spacy.load('en', parser=False, tagger=False, entity=False)
    documents = get_documents_from_event(event_name, session)

    path = '/home/mquezada/phd/multimedia-summarization/data/word_embeddings/ft_alltweets_model.vec'
    w2v = KeyedVectors.load_word2vec_format(path)

    doc_stream = nlp.pipe([doc.text for doc in documents[:, 0]], n_threads=16)
    doc_vectors = np.empty((len(documents), w2v.vector_size))

    for i, doc in tqdm(enumerate(doc_stream), total=len(documents)):
        doc_vector = []

        # merge hashtags into single tokens so they match the word-embedding vocabulary
        indexes = [m.span() for m in re.finditer(r'#\w+', doc.text, flags=re.IGNORECASE)]
        for start, end in indexes:
            doc.merge(start_idx=start, end_idx=end)

        for token in doc:
            # skip punctuation, whitespace, user mentions and URLs
            if token.pos_ == "PUNCT" or \
                    token.is_punct or \
                    token.is_space or \
                    token.text.startswith('@') or \
                    token.like_url:
                continue
            if token.lower_ in w2v:
                doc_vector.append(w2v[token.lower_])

        # representative vector is the average of all word vectors
        vector = np.mean(doc_vector, axis=0)[None]
        doc_vectors[i] = vector

    np.save(f'data/fasttext_vectors_event_{event_name}.npy', arr=doc_vectors)
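# Illustrative loader (not in the original code): read the vectors saved above and drop
# rows that are NaN (documents with no in-vocabulary tokens), mirroring the NaN filtering
# done in gen_discourse. The file name mirrors the one used in gen_fasttext_vectors.
def load_fasttext_vectors(event_name):
    vectors = np.load(f'data/fasttext_vectors_event_{event_name}.npy')
    keep = ~np.isnan(vectors).any(axis=1)
    # return the clean vectors and the positions of the documents they belong to
    return vectors[keep], np.where(keep)[0]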
logging.basicConfig(
    format='%(asctime)s | %(name)s | %(levelname)s : %(message)s',
    level=logging.INFO)

# tokenizer = Tokenizer()
number_summaries = 5

# server, engine = connect_from_rafike(username='******', password='******')
connect = lambda: connect_to_server(username="******",
                                    host="172.17.69.88",
                                    ssh_pkey="/home/luis/.ssh/id_rsa")

with connect() as engine:
    Session = sessionmaker(engine, autocommit=True)
    session = Session()

    # documents = session.query(Document).all()
    documents = events.get_documents_from_event(event_name, session)

    results_dir = Path('results', 'results_more_tweets')

    html_template = '''<blockquote class="twitter-tweet" data-lang="en">
    <a href="https://twitter.com/jack/status/%s"></a>
    </blockquote>'''

    script = '<script async src="http://platform.twitter.com/widgets.js" charset="utf-8"></script>'

    def cmp(d: Document):
        # return d.total_favs + d.total_replies + d.total_rts + d.total_tweets
        return d.total_rts

    clusters = session.query(Cluster).all()

    for i, cluster in tqdm(enumerate(clusters), total=len(clusters)):
        document_cluster = session.query(Document, DocumentCluster) \
import re
from pathlib import Path

from sqlalchemy.orm import sessionmaker

import settings
from db.engines import engine_lmartine as engine
from db.events import get_documents_from_event

"""
Exports all the documents of an event to a CSV file.
Considers only the text of the documents.
"""

event = "oscar_pistorius"
Session = sessionmaker(engine, autocommit=True)
session = Session()

documents = get_documents_from_event(event, session)
print('Number of Docs: {}'.format(len(documents)))

path_csv = Path(settings.LOCAL_DATA_DIR_2, 'data', event, f'documents_{event}.csv')
with path_csv.open('w') as csv_file:
    docs_text = [doc[0].text.replace('\t', '').replace('\n', '') + '\n' for doc in documents]
    # docs_unique = list(set(docs_text))
    # print(len(docs_unique))
    csv_file.writelines(docs_text)
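# Quick sanity check (illustrative addition, not in the original script): read the
# exported file back, one document per line, and confirm the count matches.
with path_csv.open() as csv_file:
    texts = [line.rstrip('\n') for line in csv_file]
print('Documents written: {}'.format(len(texts)))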
def gen_discourse(event_name, a, session):
    f = Path(f'data/discourse_vectors_event_{event_name}_a_{a}.npy')
    if f.is_file():
        logging.info(f"File exists: data/discourse_vectors_event_{event_name}_a_{a}.npy")
        return

    nlp = spacy.load('en', parser=False, tagger=False, entity=False)
    documents = get_documents_from_event(event_name, session)

    path = '/home/mquezada/phd/multimedia-summarization/data/word_embeddings/ft_alltweets_model.vec'
    w2v = KeyedVectors.load_word2vec_format(path)

    freq_path = '/home/mquezada/phd/multimedia-summarization/data/word_embeddings/wordfrequencies_relative.tsv'
    freqs = dict()
    pca = PCA(n_components=1)

    with open(freq_path) as freq_file:
        for line in freq_file:
            word, freq = line.split()
            freqs[word] = float(freq)

    doc_stream = nlp.pipe([doc.text for doc in documents[:, 0]], n_threads=16)
    vs = np.empty((len(documents), w2v.vector_size))

    for i, doc in tqdm(enumerate(doc_stream), total=len(documents), desc="creating vectors"):
        doc_vector = []

        # merge hashtags into single tokens so they match the word-embedding vocabulary
        indexes = [m.span() for m in re.finditer(r'#\w+', doc.text, flags=re.IGNORECASE)]
        for start, end in indexes:
            doc.merge(start_idx=start, end_idx=end)

        for token in doc:
            # skip punctuation, whitespace, user mentions and URLs
            if token.pos_ == "PUNCT" or \
                    token.is_punct or \
                    token.is_space or \
                    token.text.startswith('@') or \
                    token.like_url:
                continue
            if token.lower_ in w2v:
                w = token.lower_
                vw = w2v[w]
                pw = freqs[w]
                # weight each word vector by a / (a + p(w)) to down-weight frequent words
                doc_vector.append(a / (a + pw) * vw)

        # representative vector is the average of all weighted word vectors
        vector = np.mean(doc_vector, axis=0)[None]
        vs[i] = vector

    # drop documents whose vectors are NaN (no usable tokens)
    idx = list(range(len(vs)))
    remove_idx = np.where(np.isnan(vs).any(axis=1))[0]
    final_indices = np.array([i for i in idx if i not in remove_idx])
    vs = np.array([vs[i] for i in idx if i not in remove_idx])

    logging.info("fitting pca")
    pca.fit(vs)
    u = pca.components_

    # remove the projection onto the first principal component from every vector
    for i in trange(vs.shape[0], desc="moving vectors"):
        vs[i] = vs[i] - (u.T.dot(u)).dot(vs[i])

    np.save(f'data/discourse_vectors_event_{event_name}_a_{a}.npy', arr=vs)
    np.save(f'data/discourse_vectors_indices_{event_name}_a_{a}.npy', arr=final_indices)
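# Illustrative loader (not part of the original module): read back the discourse vectors
# together with the indices of the documents they correspond to. The file names mirror the
# ones written by gen_discourse; the a / (a + p(w)) weighting plus removal of the first
# principal component follows the smooth-inverse-frequency (SIF) sentence-embedding scheme.
def load_discourse_vectors(event_name, a):
    vs = np.load(f'data/discourse_vectors_event_{event_name}_a_{a}.npy')
    indices = np.load(f'data/discourse_vectors_indices_{event_name}_a_{a}.npy')
    return vs, indices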