def build_mention_dict():
    """Build a gensim Dictionary over all mention tokens and save it to disk.

    Streams mention records from the training, validation, and test splits,
    feeds each record's ``mentions`` token list into the Dictionary builder,
    and persists the result as ``mentions.dict`` under LOCAL_ROOT.
    """
    out_path = os.path.join(LOCAL_ROOT, 'mentions.dict')
    # Chain the three splits lazily so the full corpus is never in memory.
    splits = ('training', 'validation', 'test')
    records = itertools.chain.from_iterable(
        read_json_data(split, 'mentions.json.gz') for split in splits
    )
    token_lists = (record['mentions'] for record in records)
    vocab = Dictionary.from_documents(token_lists)
    vocab.save(out_path)
def build_entity_dict():
    """Build a gensim Dictionary over outgoing entity ids and save it to disk.

    Streams outgoing-link records from the training, validation, and test
    splits, feeds each record's ``outgoing_entity_ids`` list into the
    Dictionary builder, and persists the result as ``entities.dict`` under
    LOCAL_ROOT.
    """
    out_path = os.path.join(LOCAL_ROOT, 'entities.dict')
    # Chain the three splits lazily so the full corpus is never in memory.
    splits = ('training', 'validation', 'test')
    records = itertools.chain.from_iterable(
        read_json_data(split, 'outgoing.json.gz') for split in splits
    )
    id_lists = (record['outgoing_entity_ids'] for record in records)
    vocab = Dictionary.from_documents(id_lists)
    vocab.save(out_path)
def create_dct(df: pd.DataFrame,
               bigram: gensim.models.phrases.Phraser,
               trigram: gensim.models.phrases.Phraser,
               save: bool = False):
    """
    Create a gensim Dictionary from a dataframe of texts.

    Each row's ``text`` column is whitespace-tokenized, passed through the
    bigram then trigram phrasers, and added to the dictionary.  Rare and
    overly common tokens are pruned with ``filter_extremes``.

    Input:
    - df: dataframe with column "text"
    - bigram: bigram phraser
    - trigram: trigram phraser
    - save: if true, vocabulary is saved to "./gensim_dct.txt" and
      "./gensim_dct"

    Returns:
    - the filtered gensim Dictionary (previously the function built the
      dictionary but never returned it, making it useless unless save=True)
    """
    def wrapper_phrase(rows):
        # Lazily tokenize and apply the phrasers so the whole corpus is
        # never materialized at once.
        for row in rows:
            yield trigram[bigram[row.text.split(" ")]]

    dct = Dictionary.from_documents(wrapper_phrase(df.itertuples()))
    # Drop tokens in fewer than 1000 docs or more than 80% of docs,
    # then cap the vocabulary at 150k entries.
    dct.filter_extremes(no_below=1000, no_above=0.80, keep_n=150000)
    if save:  # idiomatic truth test instead of `save == True`
        dct.save_as_text("./gensim_dct.txt")
        dct.save("./gensim_dct")
    return dct
# Exploratory session: load previously cleaned tweet data, build a gensim
# Dictionary over the tokenized text, and fit a TF-IDF model.  This is a
# recorded trial-and-error transcript: some lines are known failures kept
# for the record (see the `# fail` marker below).
import pandas as pd
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from twip.constant import DATA_PATH
# NOTE(review): pd.np was removed in pandas >= 1.0 — confirm the pinned
# pandas version supports this alias.
np = pd.np
dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'), engine='python')
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), engine='python')
with gzip.open(os.path.join(DATA_PATH, 'text.csv.gz'), 'rb') as f:
    # NOTE(review): DataFrame.from_csv is deprecated/removed in modern
    # pandas — pd.read_csv is the replacement; confirm pandas version.
    df = pd.DataFrame.from_csv(f, encoding='utf8')
# First attempt: the loaded frame has no usable 'tokens' column yet.
d = Dictionary.from_documents(df.tokens)
# fail
df.tokens[0]
df.tokens
df.tokens.iloc[0]
# Build the tokens column by whitespace-splitting the raw text column.
df['tokens'] = df.txt.str.split()
df.tokens
df.tokens.iloc[0]
# Second attempt: feed the split token lists directly.
d = Dictionary.from_documents(df.txt.str.split())
len(d)
# First TfidfModel call passes the Dictionary positionally; the working
# call below passes it via the dictionary= keyword instead.
tfidf = TfidfModel(d)
tfidf = TfidfModel(dictionary=d)
tfidf
len(tfidf)
df.tokens[0]
def build_dict(docs):
    """Build a gensim Dictionary from an iterable of tokenized documents.

    Input:
    - docs: iterable of documents, each an iterable of string tokens.

    Returns:
    - gensim.corpora.Dictionary covering every token in ``docs``.
    """
    # Fixed the misspelled local name ('dictionay') by returning directly.
    return Dictionary.from_documents(docs)
# Load previously cleaned data # In[6]: dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'), engine='python') nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), engine='python') with gzip.open(os.path.join(DATA_PATH, 'text.csv.gz'), 'rb') as f: df = pd.DataFrame.from_csv(f, encoding='utf8') df.tokens # In[7]: d = Dictionary.from_documents(df.tokens) # In[11]: df.tokens.iloc[0] # When we said "QUOTE_NONNUMERIC" we didn't mean **ALL** nonnumeric fields ;) # In[16]: df['tokens'] = df.txt.str.split() df.tokens