Example #1
0
def vectorizer(tokens, w2v_db):
    db_path = w2v_db
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger', 'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)
    unsorted_kw = OrderedDict()
    for (w,t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
            
        else:
            continue
        w = w.lower()
        try:
            unsorted_kw[w] += label
        except KeyError:
            unsorted_kw[w] = label
    # Get the vectors of words. Maintain order as in document.
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    words = (word.lower() for word in unsorted_kw)
    for word in words:
        try:
            if token_vecs[word]: continue
        except KeyError:
            v = conn.read(word)
            if not v is None:
                token_vecs[word] = list(v)
    print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs))) #Output for debugging; total vs unique words.
    conn.close()
    return unsorted_kw, token_vecs
Example #2
0
def processor(name, url, tokens, db_path, json_dir, USE_TITLE_WORDS=False):
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger',
                       'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)

    unsorted_kw = OrderedDict()
    for (w, t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
        else:
            continue
        w = w.lower()
        try:
            unsorted_kw[w] += label
        except KeyError:
            unsorted_kw[w] = label

    # Get the vectors list
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    words = (word.lower() for word in unsorted_kw)
    for word in words:
        try:
            if token_vecs[word]: continue
        except KeyError:
            v = conn.read(word)
            if not v is None:
                token_vecs[word] = list(v)
    print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs)))
    conn.close()

    #Compute cluster centers:
    nk = round(len(token_vecs) / 4)
    data = numpy.array(list(token_vecs.values()))
    cent, _ = kmeans2(data, nk, iter=20, minit='points')
    centroids = cent.tolist()

    # Create the JSON object for this webpage.

    if not os.path.exists(json_dir):
        os.makedirs(json_dir)
    json_path = os.path.join(json_dir, name + '.json')
    file_dest = open(json_path, 'w')
    json.dump(
        {
            'url': url,
            'vectors': token_vecs,
            'keyword_frequency': unsorted_kw,
            'centroids': centroids
        }, file_dest)
    file_dest.close()
Example #3
0
def processor(name, url, tokens, db_path,json_dir, USE_TITLE_WORDS = False):
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger', 'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)

    unsorted_kw = OrderedDict()
    for (w,t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
        else:
            continue
        w = w.lower()
        try:
            unsorted_kw[w] += label
        except KeyError:
            unsorted_kw[w] = label

    # Get the vectors list
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    words = (word.lower() for word in unsorted_kw)
    for word in words:
        try:
            if token_vecs[word]: continue
        except KeyError:
            v = conn.read(word)
            if not v is None:
                token_vecs[word] = list(v)
    print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs)))
    conn.close()

    #Compute cluster centers:
    nk = round(len(token_vecs)/4)
    data = numpy.array(list(token_vecs.values()))
    cent, _ = kmeans2(data,nk,iter=20,minit='points')
    centroids = cent.tolist()

    # Create the JSON object for this webpage.

    if not os.path.exists(json_dir):
        os.makedirs(json_dir)
    json_path = os.path.join(json_dir,name+'.json')
    file_dest = open(json_path, 'w')
    json.dump({'url': url, 'vectors' : token_vecs, 'keyword_frequency': unsorted_kw, 'centroids' : centroids}, file_dest)
    file_dest.close()