import json
import os
from collections import OrderedDict

import numpy
from scipy.cluster.vq import kmeans2
from nltk.tag.stanford import POSTagger  # NLTK < 3.2; newer releases call this StanfordPOSTagger

from sqlcon import SQLCon  # project-local word2vec store wrapper; import path assumed


def vectorizer(tokens, w2v_db):
    """POS-tag the tokens, weight the nouns, and look up their word vectors."""
    db_path = w2v_db

    # POS tagging with the Stanford tagger.
    tagger = POSTagger('tagger/english-left3words-distsim.tagger',
                       'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)

    # Keep only nouns; weight proper nouns and foreign words (NNP/NNPS/FW)
    # higher than common nouns (NN/NNS), accumulating a frequency score.
    unsorted_kw = OrderedDict()
    for (w, t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
        else:
            continue
        w = w.lower()
        unsorted_kw[w] = unsorted_kw.get(w, 0) + label

    # Get the vectors of words. Maintain order as in document.
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    for word in unsorted_kw:  # keys were already lowercased above
        if word in token_vecs:
            continue
        v = conn.read(word)
        if v is not None:
            token_vecs[word] = list(v)
    # Output for debugging: total weighted keywords vs. unique words with vectors.
    print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs)))
    conn.close()
    return unsorted_kw, token_vecs
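# Illustrative usage sketch (not from the original source): the tagger paths
# above are the real Stanford model files, but 'data/w2v.db' and the token
# list below are hypothetical stand-ins for whatever SQLCon expects.
#
#   tokens = ['Berlin', 'is', 'the', 'capital', 'of', 'Germany']
#   kw, vecs = vectorizer(tokens, 'data/w2v.db')
#   # kw   -> OrderedDict of lowercased nouns with accumulated weights
#   # vecs -> OrderedDict mapping those nouns to word2vec vectors (lists)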
def processor(name, url, tokens, db_path, json_dir, USE_TITLE_WORDS=False):
    """Tag and vectorize a page's tokens, cluster the vectors, and write JSON."""
    # The tagging and vector lookup are identical to vectorizer() above, so
    # reuse it instead of repeating the code. (USE_TITLE_WORDS is accepted
    # but unused in this snippet.)
    unsorted_kw, token_vecs = vectorizer(tokens, db_path)

    # Compute cluster centers: roughly one centroid per four vectors.
    # max(1, ...) guards against nk == 0 on very short pages.
    nk = max(1, round(len(token_vecs) / 4))
    data = numpy.array(list(token_vecs.values()))
    cent, _ = kmeans2(data, nk, iter=20, minit='points')
    centroids = cent.tolist()

    # Create the JSON object for this webpage.
    if not os.path.exists(json_dir):
        os.makedirs(json_dir)
    json_path = os.path.join(json_dir, name + '.json')
    with open(json_path, 'w') as file_dest:
        json.dump({'url': url,
                   'vectors': token_vecs,
                   'keyword_frequency': unsorted_kw,
                   'centroids': centroids}, file_dest)
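# Minimal end-to-end sketch (an assumption, not part of the original file):
# run from the repo root so the 'tagger/' paths resolve; 'data/w2v.db' and
# the sample inputs are hypothetical.
if __name__ == '__main__':
    sample_tokens = ['Stanford', 'released', 'a', 'new', 'parser', 'today']
    processor('example_page',                # becomes example_page.json
              'http://example.com/article',  # stored under the 'url' key
              sample_tokens,
              'data/w2v.db',                 # hypothetical SQLCon word2vec store
              'json_out')                    # output directory, created if missing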