Code Example #1
from gensim.corpora.dictionary import Dictionary
from lib.iterators import row_stream


def build_dictionaries_from_splits(splits_template, n, save_pickle_tup=None):
    ''' Builds all 3 dictionaries (title, body, tags) from the n split files.
        If provided, `save_pickle_tup` must be a 3-tuple of pickle-file names
        in the following order:

        (title, body, tags)

        If `save_pickle_tup[i]` is None, the corresponding dictionary will not
        be saved.
    '''
    utitledict, ubodydict, utagdict = Dictionary(), Dictionary(), Dictionary()
    for eid in xrange(n):
        for row in row_stream(splits_template % eid):
            ID, title, body, tags = row
            utitledict.doc2bow(title.split(), allow_update=True)
            ubodydict.doc2bow(body.split(), allow_update=True)
            utagdict.doc2bow(tags.split(), allow_update=True)
    
    assert ubodydict.num_docs == utitledict.num_docs == utagdict.num_docs
    print "Before filtering..."
    print "utitledict:", utitledict
    print "ubodydict:", ubodydict
    print "utagdict:", utagdict
    
    if save_pickle_tup:
        assert len(save_pickle_tup) == 3
        if save_pickle_tup[0]:
            print "saving utitledict..."
            utitledict.save(save_pickle_tup[0])
        if save_pickle_tup[1]:
            print "saving ubodydict..."
            ubodydict.save(save_pickle_tup[1])
        if save_pickle_tup[2]:
            print "saving utagdict..."
            utagdict.save(save_pickle_tup[2])
            
    return (utitledict, ubodydict, utagdict)
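
A minimal usage sketch, not from the original project: the split template and
the split count are assumptions, and the pickle paths reuse the "../working"
convention seen in the later examples. gensim's Dictionary.filter_extremes is
one way to do the filtering that the print statements above refer to.

titledict, bodydict, tagdict = build_dictionaries_from_splits(
    "../data/Train_%d.csv", 4,
    save_pickle_tup=("../working/titledict.pickle",
                     "../working/bodydict.pickle",
                     None))  # None: skip saving the tag dictionary
# Drop rare and overly common tokens (gensim's built-in pruning).
titledict.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)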
Code Example #2
import csv

from lib.iterators import row_stream


def prune_csv_file1(infilename, outfilename, column, gensim_dict):
    ''' Using the (one) provided gensim.corpora.dictionary.Dictionary, prune
        tokens not found in the (already filtered) dictionary. Rows whose
        selected column is left with no tokens are dropped from the file. '''
    with open(outfilename, 'w') as f:
        wtr = csv.writer(f, delimiter=',')
        for row in row_stream(infilename):
            tokens = row[column].split()
            filtered_tokens = [token for token in tokens if token in gensim_dict.token2id]
            if not filtered_tokens:  # drop the row if no tokens remain
                continue
            row[column] = ' '.join(filtered_tokens)  # mutating the row does not affect the underlying generator
            wtr.writerow(row)
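
A hedged usage sketch (file names are illustrative; column index 1 assumes the
ID, title, body, tags row layout used in the other examples):

prune_csv_file1("../data/Train_0.csv", "../data/pruned_Train_0.csv",
                1, titledict)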
Code Example #3
import csv

from lib.iterators import row_stream


def prune_csv_file2(infilename, outfilename, gdict_tup, col_tup):
    ''' Prune a CSV file with 2 dictionaries simultaneously.
        `gdict_tup` must be a 2-tuple of gensim dicts;
        `col_tup` holds the matching column indices. '''
    col_a, col_b = col_tup
    gdict_a, gdict_b = gdict_tup
    with open(outfilename, 'w') as f:
        wtr = csv.writer(f, delimiter=',')
        for row in row_stream(infilename):
            tokens_a, tokens_b = row[col_a].split(), row[col_b].split()
            filtered_tokens_a = [token for token in tokens_a if token in gdict_a.token2id]
            filtered_tokens_b = [token for token in tokens_b if token in gdict_b.token2id]
            if not filtered_tokens_a or not filtered_tokens_b:
                continue
            row[col_a] = ' '.join(filtered_tokens_a)
            row[col_b] = ' '.join(filtered_tokens_b)
            wtr.writerow(row)
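
The matching sketch for the two-dictionary variant, pruning the title and tags
columns in one pass (again assuming the ID, title, body, tags layout and
illustrative file names):

prune_csv_file2("../data/Train_0.csv", "../data/pruned_Train_0.csv",
                (titledict, tagdict), (1, 3))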
Code Example #4
File: network.py  Project: xqk/tag_recommender
from __future__ import division
from collections import Counter, defaultdict
from itertools import combinations, izip

from gensim.corpora.dictionary import Dictionary
from lib.iterators import row_stream

import networkx as nx

common, usefulness = defaultdict(int), defaultdict(int)
total = Dictionary.load("../working/titledict.pickle")

# Count, for each title token, how often it also appears among the
# question's own tags.
num_eng = 4
for eid in xrange(num_eng):
    for row in row_stream("../data/pruned_Train_%d.csv" % eid):
        ID, title, body, tags = row
        title_tokens = title.split()
        tags = set(tags.split())
        for token in title_tokens:
            if token in tags:
                common[token] += 1

# usefulness[token]: the token's tag-coincidence count divided by its
# document frequency in the title dictionary.
for (hash_id, count) in total.dfs.iteritems():
    token = total[hash_id]
    usefulness[token] = common[token] / count
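
# Hypothetical follow-up, not in the original file: rank tokens by
# usefulness, i.e. by the share of their occurrences that coincide with a tag.
most_useful = sorted(usefulness.iteritems(), key=lambda kv: kv[1], reverse=True)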
''' Tag==>Tag recommender '''
G = nx.Graph()

num_eng = 4
for eid in xrange(num_eng):
    for row in row_stream("../data/pruned_Train_%d.csv" % eid):
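        # The source listing is truncated here; the original loop body is not
        # shown. A hedged sketch of one plausible body, consistent with the
        # `combinations` and `networkx` imports above: build a weighted tag
        # co-occurrence graph.
        ID, title, body, tags = row
        for a, b in combinations(set(tags.split()), 2):
            if G.has_edge(a, b):
                G[a][b]['weight'] += 1
            else:
                G.add_edge(a, b, weight=1)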