Example No. 1
def parse_page(title,
               text,
               vocab,
               inlinks,
               article_min_words=1,
               ignore_namespace=True):
    filtered_text = filter_wiki(text,
                                promote_remaining=False,
                                simplify_links=False)
    tok_num = 0
    # Presave the interlinks in this page (for DCA training)
    founded_links = []
    # The end of previous link
    prev = 0
    for match in RE_P16.finditer(filtered_text):
        # Tokenize sentences between the previous link and the current link
        link_start, link_end = match.span()
        if prev < link_start:
            toks = tokenize(filtered_text[prev:link_start])
            prev = link_end
            tok_num += len(toks)

        # Tokenize current link
        parts = match.groups()[0].split('|')
        entity = parts[0]
        if '[' not in entity and ']' not in entity:
            try:
                interlink_text = parts[1]
            except IndexError:
                interlink_text = entity
            if interlink_text:
                link_toks = tokenize(interlink_text)
            else:
                link_toks = []
            tok_num += len(link_toks)
            entity = entity.replace(' ', '_')
            if entity in vocab:
                founded_links.append(entity)

    # Tokenize remaining text
    toks = tokenize(filtered_text[prev:])
    tok_num += len(toks)

    # Filter
    if tok_num < article_min_words:
        return False
    if ignore_namespace:
        if any(title.startswith(name + ':') for name in IGNORED_NAMESPACES):
            return False

    # update inlinks
    for entity in founded_links:
        if entity not in inlinks:
            inlinks[entity] = set()
        inlinks[entity].add(title)
    return True
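The parse_page example above relies on names that live at module level in gensim.corpora.wikicorpus (filter_wiki, tokenize, RE_P16, IGNORED_NAMESPACES). A minimal, hypothetical driver for it could look like this; the page title, the wiki markup string and the vocab/inlinks containers are made up for illustration:

from gensim.corpora.wikicorpus import filter_wiki, tokenize, RE_P16, IGNORED_NAMESPACES  # names parse_page uses at module level

vocab = {'Machine_learning'}   # entities of interest, underscored like page titles
inlinks = {}                   # entity -> set of pages linking to it, filled in place

raw = "Some intro text with a [[Machine learning|ML]] link."
kept = parse_page('Example page', raw, vocab, inlinks)

print(kept)      # True: the page passed the word-count and namespace filters
print(inlinks)   # {'Machine_learning': {'Example page'}}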
Example No. 2
def build_glove(word2vec, target_files, output_path):
    """Collect vectors from word2vec for every word that appears in the
    'description' field of the target JSON files, and save them to
    output_path in binary word2vec format."""
    word2vec1 = KeyedVectors(vector_size=300)
    print(word2vec1.vectors.shape, (len(word2vec1.vocab), word2vec1.vector_size))
    buf1 = []
    buf2 = []
    contains = set()

    def add_buffer(w, f):
        nonlocal buf1, buf2
        if w not in contains:
            buf1.append(w)
            buf2.append(f)
            contains.add(w)

    def clear_buffer():
        nonlocal buf1, buf2
        buf1 = []
        buf2 = []

    for f in target_files:
        for i, s in enumerate(load_json(f), 1):
            sentence = s['description']

            for w in tokenize(sentence):
                w = w.lower()
                if w in word2vec:
                    add_buffer(w, word2vec[w])
            if i % 10 == 0 and len(buf1) > 0:
                word2vec1.add(buf1, buf2, replace=False)
                clear_buffer()
    if len(buf1) > 0:
        word2vec1.add(buf1, buf2, replace=False)

    print(word2vec1.vectors.shape, (len(word2vec1.vocab), word2vec1.vector_size))
    KeyedVectors.save_word2vec_format(word2vec1, output_path, binary=True)
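build_glove is written against the gensim 3.x KeyedVectors API (add, vocab); in gensim 4.x the same batched-add pattern uses add_vectors and key_to_index. A minimal sketch of that buffering idea under the 4.x names, with random vectors standing in for the source embeddings:

import numpy as np
from gensim.models import KeyedVectors

kv = KeyedVectors(vector_size=300)
buf_words, buf_vecs = [], []

for word in ['alpha', 'beta', 'gamma']:          # placeholder vocabulary
    if word not in kv.key_to_index and word not in buf_words:
        buf_words.append(word)
        buf_vecs.append(np.random.rand(300).astype(np.float32))
    if len(buf_words) >= 2:                      # flush in batches
        kv.add_vectors(buf_words, buf_vecs, replace=False)
        buf_words, buf_vecs = [], []

if buf_words:                                    # flush the remainder
    kv.add_vectors(buf_words, buf_vecs, replace=False)

print(kv.vectors.shape)                          # (3, 300)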
Example No. 3
def Pdf2Vec(titles):
    '''
    Vectorizes a given PDF on your local filesystem to a Log Entropy TF-IDF
    vector to then query against your similarity index.

    Returns:

    [document-logent-vec-1, document-logent-vec-2, ..., document-logent-vec-N]
    where N is the number of titles
    '''
    #TODO: Make it so you can give a model as an argument to vectorize a given
    #document into any trained gensim model

    ret_lst = []
    logent = LogEntropyModel.load('../models/logEntropy.model')
    diction = Dictionary.load('../models/wiki_dict.dict')
    for title in titles:
        curr_file = open('../data/articleData/pdfs/' + title + '.pdf', 'rb')  # slate needs a binary file handle
        doc = slate.PDF(curr_file)
        doc = ' '.join(doc)
        doc_tokens = wikicorpus.tokenize(doc)
        bow = diction.doc2bow(doc_tokens)
        bow_logent = logent[bow]
        ret_lst.append(bow_logent)
        curr_file.close()

    return ret_lst
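A hypothetical call, assuming the hard-coded model and dictionary paths above exist, a PDF named some_paper.pdf sits under ../data/articleData/pdfs/, and a gensim similarity index has already been built and saved (the index path and title are placeholders):

from gensim import similarities

index = similarities.Similarity.load('../models/wiki.index')   # assumed pre-built index
vecs = Pdf2Vec(['some_paper'])        # reads ../data/articleData/pdfs/some_paper.pdf
sims = index[vecs[0]]                 # similarity of the PDF against the indexed corpus
print(sorted(enumerate(sims), key=lambda x: -x[1])[:5])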
Example No. 4
def _process_page(page):
    assert len(page['section_titles']) == len(page['section_texts'])
    assert len(page.keys()) == 3

    rows = [page['title']]

    for section_title, section_text in zip(page['section_titles'],
                                           page['section_texts']):
        rows.append(section_title)
        rows.append(section_text)

    page_tok = tokenize('\n'.join(rows))
    return ' '.join(page_tok) + '\n'
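A minimal example of the page dict _process_page expects: exactly three keys, with section_titles and section_texts as parallel lists (the content below is made up):

page = {
    'title': 'Anarchism',
    'section_titles': ['Introduction', 'History'],
    'section_texts': ['Anarchism is a political philosophy ...',
                      'Early anarchist thought appears in antiquity ...'],
}
print(_process_page(page))   # one lower-cased, space-joined token string ending in '\n'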
Example No. 5
def download_articles(article_names):
    with open(ARTICLES_FILE, 'w') as articles_file:
        with open(INDEX_FILE, 'w') as index_file:
            for article_name in article_names:
                print("Processing {0}".format(article_name))
                try:
                    text = util.get_article_text(article_name)
                    tokenized_article = tokenize(text)
                    tokenized_article = [w for w in tokenized_article if w not in util.STOP_WORDS]
                    for token in tokenized_article:
                        articles_file.write("{0} ".format(token))
                    articles_file.write("\n")
                    index_file.write("{0}\n".format(article_name))

                except util.ArticleNotFoundError:
                    print("Could not find: {0}".format(article_name))
                except:
                    print("Error getting article")
Example No. 6
def _article_tfidf(lang, article_title):
    text = _article_text(lang, article_title)
    if text is None:
        return None
    else:
        return dict(_tfidf[_dict.doc2bow(tokenize(filter_wiki(text)))])
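_article_tfidf assumes two module-level globals: _dict, a gensim Dictionary, and _tfidf, a TfidfModel fitted to the same vocabulary. One plausible way they could be built (the tiny corpus is a stand-in):

from gensim.corpora import Dictionary
from gensim.models import TfidfModel

tokenized_docs = [
    ['wikipedia', 'is', 'an', 'online', 'encyclopedia'],
    ['gensim', 'builds', 'tfidf', 'models', 'from', 'dictionaries'],
]
_dict = Dictionary(tokenized_docs)
_tfidf = TfidfModel(dictionary=_dict)    # idf weights taken from the dictionary

bow = _dict.doc2bow(['gensim', 'tfidf', 'models'])
print(dict(_tfidf[bow]))                 # {term_id: tf-idf weight}, as the function returns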
Example No. 7
def tfidf_similarity_query(title, content):
    tokens = wikicorpus.tokenize(wikicorpus.filter_wiki(content))
    vector = dict(tfidf[dictionary.doc2bow(tokens)])
    return cosine_similarity(seed_article, vector)
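The seed_article vector and the cosine_similarity helper are not shown; a sketch of a cosine similarity over the sparse {term_id: weight} dicts this query builds could be:

import math

def cosine_similarity(vec_a, vec_b):
    """Cosine similarity between two sparse vectors given as {term_id: weight} dicts."""
    dot = sum(w * vec_b.get(t, 0.0) for t, w in vec_a.items())
    norm_a = math.sqrt(sum(w * w for w in vec_a.values()))
    norm_b = math.sqrt(sum(w * w for w in vec_b.values()))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)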
Example No. 8
def word_portion_query(title, content):
    words = [word.decode('utf-8') for word in
             wikicorpus.tokenize(wikicorpus.filter_wiki(content))]
    words_in_keywords = sum(word in keywords for word in words)
    return safe_ratio(words_in_keywords, len(words))
Example No. 9
def keyword_portion_query(title, content):
    word_set = {word.decode('utf-8') for word in
                wikicorpus.tokenize(wikicorpus.filter_wiki(content))}
    keywords_in_words = sum(keyword in word_set for keyword in keywords)
    return safe_ratio(keywords_in_words, len(keywords))
Example No. 10
def _prepare_description(cls, game):
    title = game['title']
    description = game['description']['full']
    whats_cool = game['description']['whats_cool_about_it']
    text = f'{title} {description} {whats_cool}'
    return tokenize(cls._cleanhtml(text))
Example No. 11
def parse_page(title,
               text,
               vocab,
               page_entity_word_co_occur,
               context_entity_word_co_occur,
               word_count,
               entity_window_size=20,
               article_min_words=1,
               ignore_namespace=True):
    filtered_text = filter_wiki(text,
                                promote_remaining=False,
                                simplify_links=False)
    tok_num = 0
    # Collect words in this page
    batch_words = []
    # Presave the index of entities in this page (for entity pre-training)
    founded_entity_idx = []
    # The end of the previous link
    prev = 0
    for match in RE_P16.finditer(filtered_text):
        # Tokenize sentences between the previous link and the current link
        link_start, link_end = match.span()
        if prev < link_start:
            toks = tokenize(filtered_text[prev:link_start], lower=False)
            for tok in toks:
                batch_words.append([tok])
            prev = link_end
            tok_num += len(toks)

        # Tokenize current link
        parts = match.groups()[0].split('|')
        entity = parts[0]
        if '[' not in entity and ']' not in entity:
            try:
                interlink_text = parts[1]
            except IndexError:
                interlink_text = entity
            if interlink_text:
                link_toks = tokenize(interlink_text, lower=False)
            else:
                link_toks = []
            batch_words.append(link_toks)
            tok_num += len(link_toks)
            entity = entity.replace(' ', '_')
            if entity in vocab:
                founded_entity_idx.append([len(batch_words) - 1, entity])

    # Tokenize remaining text
    toks = tokenize(filtered_text[prev:], lower=False)
    for tok in toks:
        batch_words.append([tok])
    tok_num += len(toks)

    # Filter
    if tok_num < article_min_words:
        return False
    if ignore_namespace:
        if any(title.startswith(name + ':') for name in IGNORED_NAMESPACES):
            return False

    # Start counting
    if title in vocab:
        if title not in page_entity_word_co_occur:
            page_entity_word_co_occur[title] = set()
        we = page_entity_word_co_occur[title]
    else:
        we = None

    for words in batch_words:
        for word in words:
            # Count occurrence #(w,)
            word_count[word] = word_count.get(word, 0) + 1
            if we is not None:
                # Count co-occurrence #(w,e) according to rule (i)
                we.add(word)

    half_entity_window_size = entity_window_size // 2
    for idx, entity in founded_entity_idx:
        if entity not in context_entity_word_co_occur:
            context_entity_word_co_occur[entity] = set()
        we = context_entity_word_co_occur[entity]
        # look toward left
        for j in range(max(idx - half_entity_window_size, 0), idx):
            for word in batch_words[j]:
                # Count #(w,e) according to rule (ii)
                we.add(word)
        # look toward right
        for j in range(idx + 1,
                       min(idx + half_entity_window_size, len(batch_words))):
            for word in batch_words[j]:
                # Count #(w,e) according to rule (ii)
                we.add(word)
    return True
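As in Example No. 1, this parse_page variant mutates the dicts passed to it; a hypothetical driver (markup and vocab invented for illustration) would be:

vocab = {'Machine_learning', 'Example page'}   # page titles / entities of interest
page_co, ctx_co, word_count = {}, {}, {}

raw = "Some intro text with a [[Machine learning|ML]] link."
parse_page('Example page', raw, vocab,
           page_co, ctx_co, word_count,
           entity_window_size=20)

print(word_count)   # occurrence counts #(w,)
print(page_co)      # words co-occurring with the page entity, rule (i)
print(ctx_co)       # words within the window around each found entity, rule (ii)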
Example No. 12
def normalizeText(self, text):
    words = tokenize(filter_wiki(text.lower()))
    text = " ".join(words)
    return text
Example No. 13
def tokenize_gensim(text):
    return tokenize(text)
Example No. 14
def _get_query(self, text):
    """Preprocess and tokenize text, return it as BOW (bag of words)."""
    return self.dictionary.doc2bow(
        wikicorpus.tokenize(wikicorpus.filter_wiki(text)))
Example No. 15
def get_tokens(text, token_min_len=1, token_max_len=100):
    text = filter_wiki(text)
    return tokenize(text, token_min_len, token_max_len, True)
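The trailing True is the lower flag of gensim's wikicorpus.tokenize, whose defaults in recent versions are roughly token_min_len=2, token_max_len=15, lower=True; loosening the bounds as above keeps one-character and very long tokens. A quick comparison (the sample sentence is made up):

from gensim.corpora.wikicorpus import filter_wiki, tokenize

raw = "I love the [[Python (programming language)|Python]] language."
print(tokenize(filter_wiki(raw)))   # defaults drop one-character tokens such as 'i'
print(get_tokens(raw))              # token_min_len=1 keeps them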
Example No. 16
def tokenize_with_stemming(*params, **kwparams):
    tokens = tokenize(*params, **kwparams)
    tokens = cut_off_references(tokens)
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens
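Neither cut_off_references nor stemmer is defined in this snippet; one plausible setup, with an NLTK Porter stemmer and a hypothetical cut_off_references that truncates the token list at the first 'references' token, would be:

from nltk.stem.porter import PorterStemmer
from gensim.corpora.wikicorpus import tokenize

stemmer = PorterStemmer()

def cut_off_references(tokens):
    # Hypothetical helper: drop everything from the first 'references' token onwards.
    try:
        return tokens[:tokens.index('references')]
    except ValueError:
        return tokens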