Example #1
def _get_wv(sentence):
    '''
    Get word2vec vectors for a sentence.
    sentence is a whitespace-segmented string; returns one averaged
    vector per non-stopword token.
    '''
    global _vectors
    vectors = []
    for y in sentence.split():
        y_ = any2unicode(y).strip()
        if y_ not in _stopwords:
            syns = nearby(y_)[0]
            c = []
            # vector for the word itself; fall back to zeros if it is
            # not present in the word2vec model
            try:
                c.append(_vectors.word_vec(y_))
            except KeyError:
                print("not exist in w2v model: %s" % y_)
                c.append(np.zeros((100, ), dtype=float))
            # also collect vectors for its nearby (synonym) words
            for n in syns:
                if n is None: continue
                try:
                    v = _vectors.word_vec(any2unicode(n))
                except KeyError:
                    v = np.zeros((100, ), dtype=float)
                c.append(v)
            # average the word and its synonyms into one vector for this token
            r = np.average(c, axis=0)
            vectors.append(r)
    return vectors
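
A minimal usage sketch, assuming the module globals (`_vectors`, `_stopwords`, `_vocab`) are already loaded and that the input sentence is pre-segmented into space-separated tokens; collapsing the per-token vectors into a single sentence vector is an illustrative choice, not part of the function above:

import numpy as np

def sentence_vector(segmented_sentence):
    # one vector per non-stopword token, as returned by _get_wv above
    token_vectors = _get_wv(segmented_sentence)
    if not token_vectors:
        return np.zeros((100,), dtype=float)  # same 100-dim fallback as above
    return np.average(token_vectors, axis=0)

vec = sentence_vector('token1 token2 token3')
print(vec.shape)  # (100,)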
Example #2
def add_word_to_vocab(word, nearby, nearby_score):
    '''
    Add a word to the vocab, together with its nearby-word list and
    nearby-score list.
    '''
    global _size
    if word is not None:
        if PLT == 2:
            # PLT == 2 (Python 2 runtime): normalize the word and its nearby words to unicode
            word = any2unicode(word)
            nearby = [any2unicode(z) for z in nearby]
        _vocab[word] = [nearby, nearby_score]
        _size += 1
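
A minimal usage sketch, assuming the module-level `_vocab` dict, `_size` counter, and `PLT` flag are already defined; the word and its nearby lists are invented for illustration:

# Assumed module-level state for the sketch
_vocab = {}
_size = 0
PLT = 3  # assumed Python 3 runtime

add_word_to_vocab('apple', ['fruit', 'pear', 'banana'], [0.92, 0.88, 0.75])
# _vocab now maps 'apple' -> [['fruit', 'pear', 'banana'], [0.92, 0.88, 0.75]]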
Example #3
def nearby(word):
    '''
    Return the nearby words for a word as [nearby_words, nearby_scores];
    both lists are empty when the word is not in the vocab.
    '''
    try:
        return _vocab[any2unicode(word)]
    except KeyError:
        return [[], []]
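
A short usage sketch of the lookup, relying only on the return shape shown above (`[nearby_words, nearby_scores]`, with empty lists for unknown words); the query word is an arbitrary example:

words, scores = nearby('example')
if not words:
    print('no nearby words recorded')
for w, s in zip(words, scores):
    print(w, s)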
Example #4
def _load_stopwords(file_path):
    '''
    Load stop words from a file, one word per line.
    '''
    global _stopwords
    # use a context manager so the file handle is closed after reading
    with open(file_path, 'r') as words:
        for w in words.readlines():
            _stopwords.add(any2unicode(w).strip())
Example #5
def _load_stopwords(file_path):
    '''
    Load stop words from a file, one word per line.
    '''
    global _stopwords
    # Python 2's open() has no encoding argument; Python 3 must decode as UTF-8
    if sys.version_info[0] < 3:
        words = open(file_path, 'r')
    else:
        words = open(file_path, 'r', encoding='utf-8')
    try:
        for w in words.readlines():
            _stopwords.add(any2unicode(w).strip())
    finally:
        words.close()
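
A minimal setup sketch, assuming `_stopwords` is a module-level set and using a hypothetical file path (one stop word per line):

# Assumed module-level state; the file path below is only illustrative.
_stopwords = set()

_load_stopwords('data/stopwords.txt')
print('%d stop words loaded' % len(_stopwords))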
Example #6
def crawl_url(browser, url, savedir, debug=False, delay=0):
    htmlname = url2filenames(url)
    browser.LoadUrl(escape_url(url), synchronous=True)
    logger.info('Fetched ' + url)
    if delay:
        time.sleep(delay)
    # save raw page
    htmlfile = os.path.join(savedir, htmlname)
    with open(htmlfile, 'wb') as srcfp:
        html = browser.getSource(True)  # synchronous get
        assert html
        srcfp.write(html)
        logger.debug('Wrote to %s' % htmlfile)
    # parse page for features, get attribute table
    logger.info('Extracting features')
    header, attributes, dom, bodyhtml = collect_features(browser)
    logger.debug('%d elements with features extracted' % len(attributes))
    # write as CSV
    csvfile = htmlfile + '.csv'
    with open(csvfile, 'wb') as csvfp:
        csvfp.write(codecs.BOM_UTF8) # Excel requires BOM
        csvout = UnicodeWriter(csvfp)
        csvout.writerow(header)
        csvout.writerows([[any2unicode(x) for x in row] for row in attributes])
    logger.info('Wrote to %s' % csvfile)
    if debug:
        lxmlfile = htmlfile + '.lxml'
        domfile = htmlfile + '.raw.csv'
        with open(domfile, 'wb') as csvfp: # write DOM csv as recognized by JS
            csvfp.write(codecs.BOM_UTF8) # Excel requires BOM
            csvout = UnicodeWriter(csvfp)
            csvout.writerow(
                "xpath display visible x y width height fgcolor bgcolor fontsize "
                "textonly htmlcode".split()
            )
            csvout.writerows([[any2unicode(x) for x in row] for row in dom])
        logger.info('Wrote to %s' % domfile)
        with open(lxmlfile, 'wb') as fp:
            fp.write(bodyhtml.encode('utf8'))
        logger.info('Wrote to %s' % lxmlfile)
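
A hypothetical usage sketch: `browser` must be an object exposing the LoadUrl() and getSource() methods used above; make_browser() stands in for whatever project-specific factory creates it, and the URL and output directory are examples only:

import os

savedir = 'pages'
if not os.path.isdir(savedir):
    os.makedirs(savedir)
browser = make_browser()  # placeholder for the project's browser wrapper
crawl_url(browser, 'http://example.com/', savedir, debug=True, delay=1)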