ftemp.write(log_txt + '\n')
print(log_txt)

if title in pages:  # If the page was added to pages, get some sample queries and the next pages.
    for link in pages[title]['links']:
        if link not in par_pages:  # Only follow links that were not already visited on this path.
            next_pages.append(link)
            # Keep the parent pages for the supervision signal.
            par_pages[link] = par_pages[title] + [title]

    # Only get queries up to max_hops. Higher hops are used only to reach other pages.
    if hops <= np.asarray(prm.max_hops).max():
        # Do not choose queries from index/category pages or when fewer than one hop was taken.
        if ("index" not in pages[title]['title']) and (hops >= 1):
            # Compute term frequencies over the page text.
            tf = utils.compute_tf(wordpunct_tokenize(pages[title]['text'].decode('ascii', 'ignore')), vocab)

            # Split the page text into sentences.
            sents_pre = tokenizer.tokenize(pages[title]['text'].decode('ascii', 'ignore'))

            # Build candidate queries by concatenating n_consec consecutive sentences.
            sents = []
            n_consec = min(len(sents_pre), prm.n_consec)
            for sk in range(0, len(sents_pre) - n_consec + 1):
                sent = ''
                for sj in range(n_consec):
                    sent += ' ' + sents_pre[sk + sj]
                sents.append(sent.strip())

            # Only keep candidates whose number of in-vocabulary words is within bounds
            # (between prm.min_words_query and prm.n_consec * prm.max_words_query known words,
            # e.g., between 10 and 30).
            sents_filtered = []
            for sent in sents:
                n_words_sent = utils.n_words(wordpunct_tokenize(sent.lower()), vocab)
                if prm.min_words_query <= n_words_sent <= prm.n_consec * prm.max_words_query:
                    sents_filtered.append(sent)
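
# Minimal, self-contained sketch (not part of the original pipeline) of the candidate-query
# construction above: slide a window of n_consec consecutive sentences over the page, join
# them, and keep only windows whose in-vocabulary word count falls within the given bounds.
# The function name and parameters (build_candidate_queries, n_consec, min_words, max_words)
# are illustrative stand-ins for prm.n_consec, prm.min_words_query, and
# prm.n_consec * prm.max_words_query; vocab is assumed to be a dict of known words, and a
# plain .split() is used here in place of wordpunct_tokenize.
def build_candidate_queries(sentences, vocab, n_consec=4, min_words=10, max_words=30):
    n_consec = min(len(sentences), n_consec)
    candidates = []
    for start in range(0, len(sentences) - n_consec + 1):
        # Concatenate n_consec consecutive sentences into one candidate query.
        query = ' '.join(sentences[start:start + n_consec]).strip()
        # Count only words that appear in the vocabulary.
        n_known = sum(1 for w in query.lower().split() if w in vocab)
        if min_words <= n_known <= max_words:
            candidates.append(query)
    return candidates

# Hypothetical usage:
# build_candidate_queries(["The cat sat on the mat.", "It purred loudly."],
#                         vocab={'the': 0, 'cat': 1, 'sat': 2, 'on': 3, 'mat': 4, 'it': 5},
#                         n_consec=2, min_words=3, max_words=30)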