Example #1

        # Fragment of the crawl loop: pages, par_pages, next_pages, hops,
        # prm, vocab and tokenizer are defined by the surrounding code.
        if title in pages:  # If the page was added to pages, sample some queries from it and collect the next pages to visit.

            for link in pages[title]['links']:
                if link not in par_pages:  # Skip links whose pages were already visited.
                    next_pages.append(link)
                    # Record the chain of parent pages as the supervision signal
                    # (see the path-tracking sketch after this listing).
                    par_pages[link] = par_pages[title] + [title]

            # Only sample queries up to max_hops; deeper pages are used only to discover further pages.
            if hops <= np.asarray(prm.max_hops).max():
                # Do not choose queries from index pages or before at least one hop has been taken.
                if ("index" not in pages[title]['title']) and (hops >= 1):
                    # Compute term frequencies over the page text (see the compute_tf sketch after this listing).
                    tf = utils.compute_tf(wordpunct_tokenize(pages[title]['text'].decode('ascii', 'ignore')), vocab)
                    # Split the page text into sentences.
                    sents_pre = tokenizer.tokenize(pages[title]['text'].decode('ascii', 'ignore'))
                    # Build candidate queries from sliding windows of n_consec consecutive sentences.
                    n_consec = min(len(sents_pre), prm.n_consec)
                    sents = []
                    for sk in range(len(sents_pre) - n_consec + 1):
                        sents.append(' '.join(sents_pre[sk:sk + n_consec]).strip())
                    # Keep only windows whose number of in-vocabulary words lies within the configured bounds.
                    sents_filtered = []
                    for sent in sents:
                        n_words_sent = utils.n_words(wordpunct_tokenize(sent.lower()), vocab)
                        if prm.min_words_query <= n_words_sent <= prm.n_consec * prm.max_words_query:
                            sents_filtered.append(sent)
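
The par_pages bookkeeping above records, for every page reached, the chain of pages that led to it; that chain is what the snippet calls the supervision signal. A minimal sketch of the same idea on a toy link graph (the graph and page names are made up for illustration):

# Toy link graph; in the snippet above, links come from pages[title]['links'].
links = {
    'start': ['a', 'b'],
    'a': ['c'],
    'b': [],
    'c': [],
}

par_pages = {'start': []}  # page -> list of parent pages on the path from the root
next_pages = ['start']
while next_pages:
    title = next_pages.pop(0)
    for link in links.get(title, []):
        if link not in par_pages:  # visit each page only once
            next_pages.append(link)
            par_pages[link] = par_pages[title] + [title]

print(par_pages['c'])  # ['start', 'a'] -- the supervision path to page 'c'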
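
utils.compute_tf is project code whose implementation is not shown here. A plausible minimal equivalent, assuming it maps each in-vocabulary token to its frequency normalized by the total token count (both the name and the normalization are assumptions):

from collections import Counter

def compute_tf(tokens, vocab):
    # Hypothetical stand-in for utils.compute_tf: term frequency of
    # in-vocabulary tokens, normalized by the total number of tokens.
    total = float(len(tokens))
    if total == 0:
        return {}
    counts = Counter(t for t in tokens if t in vocab)
    return {t: c / total for t, c in counts.items()}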
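
Putting the sentence windowing and the vocabulary filter together, the query-sampling step reduces to the following self-contained sketch; NLTK's sent_tokenize stands in for the snippet's tokenizer, and the default bounds are hypothetical placeholders for prm.n_consec, prm.min_words_query and prm.max_words_query:

from nltk.tokenize import sent_tokenize, wordpunct_tokenize

def sample_queries(text, vocab, n_consec=4, min_words=10, max_words=30):
    # Return windows of n_consec consecutive sentences whose count of
    # in-vocabulary words falls within [min_words, n_consec * max_words].
    sents = sent_tokenize(text)
    n = min(len(sents), n_consec)
    queries = []
    for i in range(len(sents) - n + 1):
        window = ' '.join(sents[i:i + n]).strip()
        n_known = sum(1 for w in wordpunct_tokenize(window.lower()) if w in vocab)
        if min_words <= n_known <= n_consec * max_words:
            queries.append(window)
    return queries

# Usage: queries = sample_queries(page_text, vocab)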