def build_dictionaries_from_splits(splits_template, n, save_pickle_tup=None): ''' Builds all 3 dictionaries from splits. If provided, `save_pickle_tup` must be a 3-tuple of the picklefile names in the following order: (title, body, tags) If `save_pickle_tup[i]` is None, the corresponding dictionary will not be saved. ''' utitledict, ubodydict, utagdict = Dictionary(), Dictionary(), Dictionary() for eid in xrange(n): for row in row_stream(splits_template % eid): ID, title, body, tags = row utitledict.doc2bow(title.split(), allow_update=True) ubodydict.doc2bow(body.split(), allow_update=True) utagdict.doc2bow(tags.split(), allow_update=True) assert ubodydict.num_docs == utitledict.num_docs == utagdict.num_docs print "Before filtering..." print "utitledict:", utitledict print "ubodydict:", ubodydict print "utagdict:", utagdict if save_pickle_tup: assert len(save_pickle_tup) == 3 if save_pickle_tup[0]: print "saving utitledict..." utitledict.save(save_pickle_tup[0]) if save_pickle_tup[1]: print "saving ubodydict..." ubodydict.save(save_pickle_tup[1]) if save_pickle_tup[2]: print "saving utagdict..." utagdict.save(save_pickle_tup[2]) return (utitledict, ubodydict, utagdict)
def prune_csv_file1(infilename, outfilename, column, gensim_dict):
    '''
    Copy the csv file `infilename` to `outfilename`, keeping in column
    `column` only the whitespace-separated tokens that appear in
    `gensim_dict` (a gensim.corpora.dictionary.Dictionary).

    Rows whose filtered column ends up with no tokens are dropped from
    the output entirely.
    '''
    # 'wb' (not 'w'): the Python 2 csv module requires files opened in
    # binary mode, otherwise extra '\r' bytes are injected on Windows.
    with open(outfilename, 'wb') as f:
        wtr = csv.writer(f, delimiter=',')
        # Hoist the vocabulary dict; membership tests on it are O(1).
        vocab = gensim_dict.token2id
        for row in row_stream(infilename):
            kept = [token for token in row[column].split() if token in vocab]
            if not kept:
                # No surviving tokens: remove the example from the file.
                continue
            row[column] = ' '.join(kept)
            wtr.writerow(row)
def prune_csv_file2(infilename, outfilename, gdict_tup, col_tup):
    '''
    Prune a csv file against two dictionaries simultaneously.

    `gdict_tup` must be a 2-tuple of gensim Dictionaries and `col_tup`
    the matching 2-tuple of column indices. A row is written only if
    BOTH columns retain at least one token after filtering; otherwise
    the row is dropped.
    '''
    col_a, col_b = col_tup
    gdict_a, gdict_b = gdict_tup
    # Hoist the vocabulary dicts; membership tests on them are O(1).
    vocab_a, vocab_b = gdict_a.token2id, gdict_b.token2id
    # 'wb' (not 'w'): the Python 2 csv module requires binary mode,
    # otherwise extra '\r' bytes are injected on Windows.
    with open(outfilename, 'wb') as f:
        wtr = csv.writer(f, delimiter=',')
        for row in row_stream(infilename):
            kept_a = [t for t in row[col_a].split() if t in vocab_a]
            kept_b = [t for t in row[col_b].split() if t in vocab_b]
            if not kept_a or not kept_b:
                # Either column would become empty: drop the example.
                continue
            row[col_a] = ' '.join(kept_a)
            row[col_b] = ' '.join(kept_b)
            wtr.writerow(row)
from __future__ import division
from collections import Counter, defaultdict
from gensim.corpora.dictionary import Dictionary
from lib.iterators import row_stream
from itertools import izip
import networkx as nx
from itertools import combinations

# common[token]  -> number of training examples where `token` appears in
#                   BOTH the title and the tag set.
# usefulness[token] -> common[token] normalised by the token's document
#                   frequency (computed in the second loop below).
common, usefulness = defaultdict(int), defaultdict(int)
# Title-token dictionary loaded from disk; presumably the pickle saved by
# build_dictionaries_from_splits — TODO confirm.
total = Dictionary.load("../working/titledict.pickle")

num_eng = 4  # number of pruned CSV split files to scan
for eid in xrange(num_eng):
    for row in row_stream("../data/pruned_Train_%d.csv" % eid):
        ID, title, body, tags = row
        title_tokens = title.split()
        tags = set(tags.split())  # NOTE: rebinds `tags` from string to set
        for token in title_tokens:
            if token in tags:
                common[token] += 1

# total.dfs maps token-id -> document frequency; true division is active
# (from __future__ import division), so usefulness is a float ratio.
for (hash_id, count) in total.dfs.iteritems():
    token = total[hash_id]
    usefulness[token] = common[token] / count

''' Tag==>Tag recommender '''
G = nx.Graph()

num_eng = 4
for eid in xrange(num_eng):
    for row in row_stream("../data/pruned_Train_%d.csv" % eid):
        # NOTE(review): the body of this loop is truncated in this chunk —
        # the rest of the tag co-occurrence graph construction is not
        # visible here.
from __future__ import division
from collections import Counter, defaultdict
from gensim.corpora.dictionary import Dictionary
from lib.iterators import row_stream
from itertools import izip
import networkx as nx
from itertools import combinations

# Score each title token by how often it doubles as a tag: `common`
# counts examples where the token occurs in both title and tag set, and
# `usefulness` divides that by the token's document frequency.
common, usefulness = defaultdict(int), defaultdict(int)
total = Dictionary.load("../working/titledict.pickle")

num_eng = 4
for eid in xrange(num_eng):
    for row in row_stream("../data/pruned_Train_%d.csv" % eid):
        ID, title, body, tags = row
        tag_set = set(tags.split())
        for tok in title.split():
            if tok in tag_set:
                common[tok] += 1

# total.dfs: token-id -> document frequency; true division is in effect,
# so each usefulness value is a float in [0, 1].
for (hash_id, count) in total.dfs.iteritems():
    token = total[hash_id]
    usefulness[token] = common[token] / count

''' Tag==>Tag recommender '''
G = nx.Graph()
num_eng = 4