def _get_wv(sentence):
    '''
    get word2vec data by sentence
    sentence is segmented string.
    '''
    global _vectors
    vectors = []
    for y in sentence.split():
        y_ = any2unicode(y).strip()
        if y_ not in _stopwords:
            syns = nearby(y_)[0]
            # print("sentence %s word: %s" % (sentence, y_))
            # print("sentence %s word nearby: %s" % (sentence, " ".join(syns)))
            c = []
            # vector for the word itself; fall back to a zero vector if it is
            # missing from the word2vec model
            try:
                c.append(_vectors.word_vec(y_))
            except KeyError:
                print("not exist in w2v model: %s" % y_)
                c.append(np.zeros((100,), dtype=float))
            # vectors for the word's nearby synonyms
            for n in syns:
                if n is None:
                    continue
                try:
                    v = _vectors.word_vec(any2unicode(n))
                except KeyError:
                    v = np.zeros((100,), dtype=float)
                c.append(v)
            # average the word vector with its synonyms' vectors
            r = np.average(c, axis=0)
            vectors.append(r)
    return vectors
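# Usage sketch (not part of the original module): assuming the word2vec model,
# vocab and stopwords above are already loaded, the per-word vectors returned by
# _get_wv can be pooled and compared with cosine similarity. The helper below is
# hypothetical and only illustrates one way to consume _get_wv's output.
def _example_sentence_similarity(sent1, sent2):
    # average the per-word vectors into a single sentence vector
    v1 = np.average(_get_wv(sent1), axis=0)
    v2 = np.average(_get_wv(sent2), axis=0)
    # cosine similarity between the two sentence vectors
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2) + 1e-8)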
def add_word_to_vocab(word, nearby, nearby_score):
    '''
    Add word into vocab by word, nearby list and nearby_score list
    '''
    global _size
    if word is not None:
        if PLT == 2:
            # on Python 2 the strings must be converted to unicode explicitly
            word = any2unicode(word)
            nearby = [any2unicode(z) for z in nearby]
        _vocab[word] = [nearby, nearby_score]
        _size += 1
def nearby(word):
    '''
    Nearby word: return [nearby_words, nearby_scores] for the given word,
    or two empty lists if the word is not in the vocab.
    '''
    try:
        return _vocab[any2unicode(word)]
    except KeyError:
        return [[], []]
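# Usage sketch (not in the original source): assuming _vocab and _size are
# initialized module globals, words can be registered and queried like this.
# The example words and scores are made up for illustration only.
add_word_to_vocab("识别", ["辨认", "辨识", "分辨"], [0.87, 0.84, 0.80])
words, scores = nearby("识别")   # -> ["辨认", "辨识", "分辨"], [0.87, 0.84, 0.80]
missing = nearby("不存在的词")    # -> [[], []] for out-of-vocab words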
def _load_stopwords(file_path):
    '''
    load stop words
    '''
    global _stopwords
    words = open(file_path, 'r')
    stopwords = words.readlines()
    for w in stopwords:
        _stopwords.add(any2unicode(w).strip())
    words.close()
def _load_stopwords(file_path):
    '''
    load stop words
    '''
    global _stopwords
    if sys.version_info[0] < 3:
        words = open(file_path, 'r')
    else:
        # Python 3 needs an explicit encoding for the UTF-8 stopword file
        words = open(file_path, 'r', encoding='utf-8')
    stopwords = words.readlines()
    for w in stopwords:
        _stopwords.add(any2unicode(w).strip())
    words.close()
def crawl_url(browser, url, savedir, debug=False, delay=0):
    html = url2filenames(url)
    browser.LoadUrl(escape_url(url), synchronous=True)
    logger.info('Fetched ' + url)
    if delay:
        time.sleep(delay)
    # save raw page
    htmlfile = os.path.join(savedir, html)
    with open(htmlfile, 'wb') as srcfp:
        html = browser.getSource(True)  # synchronous get
        assert html
        srcfp.write(html)
    logger.debug('Wrote to %s' % htmlfile)
    # parse page for features, get attribute table
    logger.info('Extracting features')
    header, attributes, dom, bodyhtml = collect_features(browser)
    logger.debug('%d elements with features extracted' % len(attributes))
    # write as CSV
    csvfile = htmlfile + '.csv'
    with open(csvfile, 'wb') as csvfp:
        csvfp.write(codecs.BOM_UTF8)  # Excel requires BOM
        csvout = UnicodeWriter(csvfp)
        csvout.writerow(header)
        csvout.writerows([[any2unicode(x) for x in row] for row in attributes])
    logger.info('Wrote to %s' % csvfile)
    if debug:
        lxmlfile = htmlfile + '.lxml'
        domfile = htmlfile + '.raw.csv'
        with open(domfile, 'wb') as csvfp:
            # write DOM csv as recognized by JS
            csvfp.write(codecs.BOM_UTF8)  # Excel requires BOM
            csvout = UnicodeWriter(csvfp)
            csvout.writerow(
                "xpath display visible x y width height fgcolor bgcolor fontsize "
                "textonly htmlcode".split()
            )
            csvout.writerows([[any2unicode(x) for x in row] for row in dom])
        logger.info('Wrote to %s' % domfile)
        with open(lxmlfile, 'wb') as fp:
            fp.write(bodyhtml.encode('utf8'))
        logger.info('Wrote to %s' % lxmlfile)
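# Usage sketch (not in the original source): crawl_url expects a browser wrapper
# exposing LoadUrl(url, synchronous=True) and getSource(True), plus the
# url2filenames/escape_url/collect_features helpers defined elsewhere in this
# project. The make_browser() factory and close() call below are placeholder
# assumptions; the real setup depends on the embedded browser being used.
if __name__ == '__main__':
    browser = make_browser()  # hypothetical factory for the embedded browser wrapper
    try:
        crawl_url(browser, 'https://example.com/', savedir='out', debug=True, delay=2)
    finally:
        browser.close()  # hypothetical cleanup; depends on the actual wrapper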