Example #1
0
 def __iter__(self):
     #for line in open('ebola-raw.txt'):
     for line in open('testdata01.txt'):
         #for line in open('twitter2Mb.txt'):
         line = re.sub('<[^>]+>', '', line)
         # keep the lemmas; the original call discarded lemmatize's return value
         line = ' '.join(tok.decode('utf-8').split('/')[0] for tok in utils.lemmatize(line))
         # assume there's one document per line, tokens separated by whitespace
         yield dictionary.doc2bow(line.lower().split())
Example #2
0
def lemmatize_an_idea(idea, use_stoplist=True):
    if idea in lemma_dict:
        return lemma_dict[idea]
    if use_stoplist:
        lemm = [lem[:-3] for lem in lemmatize(idea) if lem[:-3] not in stoplist]
    else:
        lemm = [lem[:-3] for lem in lemmatize(idea) if lem[:-3]]
    lemma_dict[idea] = lemm
    return lemm
Example #3
0
def lemmatize_an_idea(idea, use_stoplist=True):
    if idea in lemma_dict:
        return lemma_dict[idea]
    if use_stoplist:
        lemm = [
            lem[:-3] for lem in lemmatize(idea) if lem[:-3] not in stoplist
        ]
    else:
        lemm = [lem[:-3] for lem in lemmatize(idea) if lem[:-3]]
    lemma_dict[idea] = lemm
    return lemm
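The lem[:-3] slicing in Examples #2 and #3 relies on the output format of gensim's pattern-based lemmatizer: utf8-encoded word/POS strings with a two-letter tag. A minimal sketch of that convention, assuming an older gensim with the pattern package installed (output shown is approximate):

from gensim.utils import lemmatize

# lemmatize returns byte strings such as b'car/NN', b'be/VB', b'run/VB'
tokens = lemmatize("The cars were running")
# dropping the last three characters ('/' plus the two-letter POS tag)
# leaves just the lemma, which is what lemmatize_an_idea keeps
lemmas = [tok[:-3].decode("utf-8") for tok in tokens]
print(lemmas)  # roughly ['car', 'be', 'run']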
Example #4
0
def process_article(args):
    """
    Parse a wikipedia article, returning its content as a list of tokens
    (utf8-encoded strings).
    """
    text, lemmatize, title, pageid = args
    categories = get_categories(text)
    if not list(set(categories).intersection(input_categories)):
        return None, None, None, None
    text = filter_wiki(text)
    sentences = sentence_tokenize(text)
    title = title.replace(' ', '_')
    paragraphs = {}

    # Split document into paragraphs
    # sentences = [s0, s1, t0, s2, t1, ...]
    paragraph_title = [title]
    level = 1
    this_sentences = []

    for sent in sentences:
        # Sent is a paragraph title
        if sent[:1] == '=':
            pt = '/'.join(paragraph_title)
            pt = pt.replace(',', '')
            paragraphs[pt] = this_sentences
            this_sentences = []
            # Level of paragraph
            level = max(len(s) for s in re.findall(r'=+', sent))
            this_title = sent[level:len(sent)-level].strip().replace(' ', '_')
            if level > len(paragraph_title):
                paragraph_title.append(this_title)
            elif level < len(paragraph_title):
                for i in range(len(paragraph_title)-level):
                    paragraph_title.pop()
                paragraph_title[level-1] = this_title
            else:
                paragraph_title[level-1] = this_title
        else:
            this_sentences.append(sent)
    pt = '/'.join(paragraph_title)
    pt = pt.replace(',', '')
    paragraphs[pt] = this_sentences

    if lemmatize:
        result = {
            k: [utils.lemmatize(s) for s in v if len(utils.lemmatize(s)) >= 2]
            for k, v in paragraphs.items() if len(v) >= 0
        }
    else:
        result = {
            k: [word_tokenize(s) for s in v if len(word_tokenize(s)) >= 2]
            for k, v in paragraphs.items() if len(v) >= 0
        }
    return categories, result, title, pageid
Example #5
0
def lemmatizeCorpus(document, isListOfDocs=False):
    if isListOfDocs:
        docs = []
        for doc in document:
            _lemmitizedTokens = lemmatize(doc)
            docs.append([
                token.decode("utf-8").split("/")[0]
                for token in _lemmitizedTokens
            ])
        return docs
    else:
        _lemmitizedTokens = lemmatize(document)
        return [
            token.decode("utf-8").split("/")[0] for token in _lemmitizedTokens
        ]
Example #6
0
 def __init__(self,
              searchPhrase,
              dbname='TwitterDB',
              host='localhost',
              port=27017,
              query=None,
              k=0):
     self.queries = Queries(dbname=dbname, host=host, port=port)
     self.words = [
         word.split('/')[0] for word in lemmatize(
             cleanText.removeStopWords(
                 cleanText.cleanText(searchPhrase)[0]))
     ]
     self.idfs = dict()
     and_list = []
     if self.words:
         for word in self.words:
             and_list.append({'words.word': word})
         self.query_search = {"$and": and_list}
         if query:
             self.existing = True
             self.query_search.update(query)
         else:
             self.existing = False
         self.k = k
Example #7
0
def proc_ent(ent):
    ent = ent.lower().replace('.', ' ').replace('-', ' ').strip().replace(
        '_', ' ').replace('|', ' ').strip()
    ent = ' '.join(
        [tok.decode('utf-8').split('/')[0] for tok in lemmatize(ent)])
    # ent = ' '.join(list( set(ent.split()) - set(config.stpwords)))
    return ent
Example #8
0
    def preprocess_data(cls):
        """
            It will process the ground data on which we are going to test and return the result.
        """
        preprocessed_description, preprocessed_speciality = [], []

        for _, sentence in enumerate(CURO_DATA["Description"].values):
            # Expand contractions so that no meaning is lost
            sentence = CURO().contraction(str(sentence))
            # Eliminate those words which are with numbers
            sentence = re.sub(r"\S*\d\S*", "", sentence).strip()
            # Eliminate all numerics and special characters
            sentence = re.sub('[^A-Za-z]+', " ", sentence)
            # Remove all stopwords from each sentence, convert to lowercase
            sentence = " ".join(e.lower() for e in str(sentence).split() if e.lower() \
                                not in STOPWORDS)
            # Lemmatize all words
            sentence = " ".join([word.decode('utf-8').split('/')[0] for word in \
                                lemmatize(sentence)])
            preprocessed_description.append(sentence.strip())

        for _, sentence in enumerate(CURO_DATA["Speciality"].values):
            # Drop the '@#$' delimiter when nothing follows it, otherwise turn it into ' => '
            sentence = sentence.replace("@#$", "") if not sentence.split("@#$")[1] \
                                                    else sentence.replace("@#$", " => ")
            # Convert each word to lowercase
            sentence = " ".join(e.lower() for e in str(sentence).split())
            preprocessed_speciality.append(sentence.strip())

        CURO_DATA["Preprocessed_Description"] = preprocessed_description
        CURO_DATA["Preprocessed_Speciality"] = preprocessed_speciality
Example #9
0
def foodwordReplacedTokenizer(review):
    """
    Expand contractions, lemmatize, and replace food-related
    words with "FOODWORD".
    """
    # Expand contractions
    words = []
    for word in review.split():
        word = word.lower()
        if word in contractions:
            word = contractions[word]
        words += [word]
    review = ' '.join(words)

    # Lemmatize from parts of speech
    tokens = []
    for lemma in utils.lemmatize(review):
        lemma, pos = lemma.split('/')
        tokens += [lemma]

    # Re-merge for more processing
    lemmatized_review = ' '.join(tokens)

    # Attach 'not' to the word that follows it (negation marking)
    formatted_lm_review = lemmatized_review.replace(' not ', ' not_')

    # Food word replacement
    words = []
    for word in formatted_lm_review.split():
        if 'noun.food' in [syn.lexname() for syn in wn.synsets(word)]:
            words += ['FOODWORD']
        else:
            words += [word]

    return ' '.join(words)
Example #10
0
def process_texts(bigram, texts):
    """
    Function to process texts. Following are the steps we take:
    
    1. Stopword Removal.
    2. Collocation detection.
    3. Lemmatization (not stem since stemming can reduce the interpretability).
    
    Parameters:
    ----------
    bigram-- trained gensim Phrases (collocation detection) model.
    texts-- Tokenized texts.
    
    Returns:
    -------
    texts: Pre-processed tokenized texts.
    """
    
    texts = [[word for word in line if word not in stops] for line in texts]
    texts = [bigram[line] for line in texts]
    texts = [
        [word.split('/')[0]
         for word in lemmatize(' '.join(line), allowed_tags=re.compile('(NN)'), min_length=3)]
        for line in texts
    ]

    return texts
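A hedged usage sketch for process_texts above, showing how the stops set and the bigram collocation model it expects might be prepared with gensim (variable names and parameters here are illustrative assumptions):

import re
from gensim.models import Phrases
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import lemmatize

stops = STOPWORDS
tokenized_texts = [
    "the solar panel converts sunlight into electricity".split(),
    "wind turbines also generate electricity".split(),
]
bigram = Phrases(tokenized_texts, min_count=1, threshold=2)  # collocation detector
cleaned = process_texts(bigram, tokenized_texts)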
Example #11
0
    def phrases(self, clean_text):
        all_lemmas = lemmatize(clean_text, stopwords=self.stopwords)
        curated_words = [str(word).split('/')[0] for word in all_lemmas]
        curated_text = ' '.join(curated_words)

        doc = textacy.Doc(curated_text, lang='en')

        all_phrases = []
        all_phrases += textacy.extract.ngrams(doc,
                                              2,
                                              filter_stops=True,
                                              filter_punct=True,
                                              filter_nums=True)
        all_phrases += textacy.extract.ngrams(doc,
                                              3,
                                              filter_stops=True,
                                              filter_punct=True,
                                              filter_nums=True)
        all_phrases += textacy.extract.ngrams(doc,
                                              4,
                                              filter_stops=True,
                                              filter_punct=True,
                                              filter_nums=True)
        all_phrases += textacy.extract.ngrams(doc,
                                              5,
                                              filter_stops=True,
                                              filter_punct=True,
                                              filter_nums=True)

        phrases = [str(phrase) for phrase in all_phrases]

        return phrases
Example #12
0
def preprocess_text(tweet):
    """
    Function to process an aggregated user profile. This does the following:
    1. Decode html entities. eg. "AT&amp;T" will become "AT&T"
    2. Deaccent
    3. Remove links.
    4. Remove any user mentions (@name).
    5. Lemmatize and remove stopwords.
    
    Parameters:
    ----------
    text : String. If train_texts is a list of tweets, ' '.join and pass
    
    Returns:
    -------
    text : preprocessed (tokenized) tweet.
    """
    tweet = decode_htmlentities(tweet)
    tweet = deaccent(tweet)
    tweet = tweet.encode('ascii',
                         'ignore')  # To prevent UnicodeDecodeErrors later on
    tweet = re.sub(r'http\S+', '', str(tweet))  # Step 3
    tweet = re.sub(r'@\w+', '', str(tweet))  # Step 4
    tweet = tweet.split()
    tweet = lemmatize(' '.join(tweet),
                      re.compile('(NN)'),
                      stopwords=stopwords.words('english'),
                      min_length=3,
                      max_length=15)
    tweet = [word.split('/')[0] for word in tweet]
    return tweet
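A small, hedged usage sketch for the tweet preprocessor above (it assumes the NLTK stopwords corpus is downloaded; the exact output depends on the pattern tagger):

raw = "AT&amp;T dropped my call again!! https://t.co/abc123 @support please fix the network"
tokens = preprocess_text(raw)
# indicative result: a short list of lemmatized nouns, e.g. something like ['call', 'network']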
Example #13
0
def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN,
                    token_max_len=TOKEN_MAX_LEN, lower=True):
    """Parse a Wikipedia article, extract all tokens.
    Notes
    -----
    Set `tokenizer_func` (defaults is :func:`~gensim.corpora.wikicorpus.tokenize`) parameter for languages
    like Japanese or Thai to perform better tokenization.
    The `tokenizer_func` needs to take 4 parameters: (text: str, token_min_len: int, token_max_len: int, lower: bool).
    Parameters
    ----------
    args : (str, bool, str, int)
        Article text, lemmatize flag (if True, :func:`~gensim.utils.lemmatize` will be used), article title,
        page identificator.
    tokenizer_func : function
        Function for tokenization (defaults is :func:`~gensim.corpora.wikicorpus.tokenize`).
        Needs to have interface:
        tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str.
    token_min_len : int
        Minimal token length.
    token_max_len : int
        Maximal token length.
    lower : bool
         Convert article text to lower case?
    Returns
    -------
    (list of str, str, int)
        List of tokens from article, title and page id.
    """
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenizer_func(text, token_min_len, token_max_len, lower)
    return result, title, pageid
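The docstring above only states the tokenizer_func contract, so here is a minimal, hypothetical tokenizer that satisfies the four-parameter interface (not part of gensim):

def whitespace_tokenizer(text, token_min_len, token_max_len, lower):
    # trivial whitespace tokenizer honouring (text, token_min_len, token_max_len, lower)
    tokens = text.lower().split() if lower else text.split()
    return [t for t in tokens if token_min_len <= len(t) <= token_max_len]

# hypothetical call with lemmatization disabled:
# tokens, title, pageid = process_article(
#     (article_text, False, "Some article", 12345), tokenizer_func=whitespace_tokenizer)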
Example #14
0
def preprocess_text(lemma, document):
    with open(document, 'r') as infile:
        # transform document into one string
        text = ' '.join(line.rstrip('\n') for line in infile)
    # convert string into unicode
    text = gensim.utils.any2unicode(text)

    # remove URL's
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '',
                  text)

    # remove symbols excluding the @, # and \s symbol
    text = re.sub(r'[^\w@#\s]', '', text)

    if lemma:
        return utils.lemmatize(text, stopwords=ignore_words, min_length=3)

    # tokenize words using NLTK Twitter Tokenizer
    tknzr = TweetTokenizer()
    text = tknzr.tokenize(text)

    # lowercase, drop tokens of length <= 2 and purely numeric tokens
    text = [
        word.lower() for word in text if len(word) > 2 and not word.isdigit()
    ]

    # remove stopwords
    return [word for word in text if not word in ignore_words]
Example #15
0
def lemmaSentence1(i, curSentence):
    x = lemmatize(curSentence)
    x = set([y.decode('utf-8').split('/')[0] for y in x])
    x = [str(y).lower() for y in x if len(y) > 2]
    #print("Completed")
    print("Completed for i {0}".format(i))
    return (TaggedDocument(words=x, tags=[str(i)]))
Example #16
0
def extract_user(user):
    with open('../data/' + user + '/interesting_articles.txt') as stalk_f:
        articles = filter(lambda x: x != '',
                stalk_f.read().rstrip('\n').split(' '))

    tastes = numpy.array([0.0 for i in range(lda.num_topics)])
    total = 0.0
    having = 0
    not_having = 0

    for article in articles:
        #print article
        try:
            text = open('../data/' + article + '.txt').read()
            having += 1
        except IOError: # we don't have this article
            not_having += 1
            continue
        if LEMMATIZE:
            a = utils.lemmatize(text)
        else:
            print >> sys.stderr, "ERROR: install pattern"
            sys.exit(-1)
        for topicid, proba in lda[lda.id2word.doc2bow(a)]:
            total += proba
            tastes[topicid] += proba

    tastes /= total

    of = open(user+'.params', 'w')
    pickle.dump(tastes.tolist(), of)

    print "For user:"******" we had:", having, "and missed:", not_having, "->", having*100.0/(having+not_having+0.000001), "%"
Example #17
0
 def __init__(self, searchPhrase, dbname='TwitterDB', query=False, k=0):
     client = pymongo.MongoClient()
     self.db = client[dbname]
     self.words = [word.split('/')[0] for word in lemmatize(cleanText.removeStopWords(cleanText.cleanText(searchPhrase)[0]))]
     self.listSearch = {}
     self.query = query
     self.k = k
Example #18
0
    def get_texts(self):
        """
        Iterate over the HN articles returning text
        """
        positions, hn_articles = 0, 0

        # ************ HN articles ************
        fnamelist = []
        for g in glob.iglob(self.hn_folder + '/*.txt'):
            fnamelist.append(g)
        for fileno, fname in enumerate(fnamelist):
            hn_text = open(fname).read()
            hn_articles += 1
            if LEMMATIZE:
                result = utils.lemmatize(hn_text)
                positions += len(result)
                yield result
            else:
                result = tokenize(hn_text) # text into tokens here
                positions += len(result)
                yield result

        print (">>> finished iterating over HN corpus of %i documents with %i positions" % (hn_articles, positions))

        self.length = hn_articles # cache corpus length
Example #19
0
 def _pos_tokenize_document(self, doc):
     tokens = simple_preprocess(doc)
     # lemmatize, POS-tag, and remove stopwords (and empty strings) from the token list for stories
     pos_tokens = [
         lemmatize(t) for t in tokens if t not in STOPWORDS and len(t) > 0
     ]
     # flatten the list-of-lists of POS tokens created by previous operation and return
     return [word for inner_list in pos_tokens for word in inner_list]
Example #20
0
def process_article(args):
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result, title, pageid
Example #21
0
    def __init__(self, document):
        """
        :param document: A string with the content of the document.
        """

        # use the pattern-based lemmatizer, see gensim.utils.lemmatize.
        # Note: by default only tokens with 2 <= len <= 15 are kept
        self.clean_document_ = utils.lemmatize(document)
Example #22
0
def clean(text):
    text = strip_multiple_whitespaces(strip_non_alphanum(text)).split()
    words = []
    for word in text:
        tmp = lemmatize(word)
        if tmp:
            words.append(tmp[0][:-3].decode("utf-8"))
    return " ".join(words)
Example #23
0
def parse(text):
    def tokenize(text):
        return [
            token.encode('utf8')
            for token in utils.tokenize(text, lower=True, errors='ignore')
            if 2 <= len(token) <= 20 and not token.startswith('_')
        ]
    global LEMMATIZE
    if LEMMATIZE:
        return utils.lemmatize(text)
    else:
        return tokenize(text)
Example #24
0
def process_article(args):
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result, title, pageid
Example #25
0
 def __init__(self, searchPhrase, k=0):
     self.words = [
         word.split('/')[0] for word in lemmatize(
             cleanText.removeStopWords(
                 cleanText.cleanText(searchPhrase)[0]))
     ]
     self.listSearch = {}
     self.k = k
Example #26
0
def tokenize(post):
    for currPunct in punctuations:
        post = post.replace(currPunct, "")
    if bool(emoji.get_emoji_regexp().search(post)):
        post = emoji.demojize(post)
    tokens = lemmatize(post)
    # lemmatize yields byte strings like b'word/NN'; str() renders them as "b'word/NN'",
    # so split on '/' and then on the quote to recover the bare lemma
    tokens = [str(x).split("/")[0].split('\'')[1] for x in tokens]
    tokens = [item for item in tokens if item not in stop and item not in add_stop]
    return tokens
Example #27
0
def process_article(args):
    # override original method in wikicorpus.py
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result, title, pageid
Example #28
0
    def _phrases_in_raw_text_via_lemmatisation(self, raw_text):
        """
        Builds a list of lemmas from raw text using lemmatization.
        """
        all_lemmas = lemmatize(raw_text, allowed_tags=re.compile('(NN|JJ)'), stopwords=STOPWORDS_UNICODE)
        document_bigrams = self.fetch_document_bigrams(all_lemmas)
        known_bigrams = [bigram for bigram in document_bigrams if bigram in self.top_bigrams]

        return (all_lemmas + known_bigrams)
Example #29
0
def gensimlemm(texts):
    texts_out = []
    for sent in texts:
        doc = " ".join(sent)
        # print(doc)
        if len(doc) > 0:
            lemmatized_out = [wd.decode('utf-8').split('/')[0] for wd in lemmatize(doc) if wd.decode('utf-8').split('/')[1]=='NN']
            texts_out.append(lemmatized_out)
    return texts_out
Example #30
0
def clean_feedback(row):
    tokenizer = RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter()
    stemmer = StemFilter()
    combined = row['Feedback']
    lemmList = [word.decode('utf-8').split('/')[0] for word in lemmatize(combined)]
    tokenWords = [token.text for token in tokenizer(combined)]
    stemWords = [stemmer.stemfn(word) for word in tokenWords]
    final = tokenWords + lemmList + stemWords
    return ' '.join(set(final))  # Join by space so it is easy for RegexTokenizer to manage
Example #31
0
def GetNounsFromDefinition(definition=str()):
    nouns_ = []
    lemma_ = lemmatize(definition)
    for word in lemma_:
        word_pos_ = word.split('/')
        if word_pos_[1][0] in ['N', 'R', 'J']:
            nouns_.append(word_pos_[0])

    return nouns_
Example #32
0
    def __init__(self, text):
        """
            :param text: content of document
            :type text: string
        """

        # d = {<t1, w1>, ... <tm, wm>}
        self.terms_quantity = Counter(
            lemma for lemma in lemmatize(text) if lemma[:-3] not in STOPWORDS
        )
Example #33
0
def gensimTest(text):
    print 'gensim'
    start = time()
    lemmas = lemmatize(text)
    for lemma in lemmas:
        lemma = lemma.split('/')
        print lemma[0], lemma[1]
    end = time()
    print 'gensim time:', (end-start)
    print "********************************"
Example #34
0
def posNN(text):
    tokens = []
    for word in lemmatize(text):
        st = word.decode("utf-8").split("/")
        #print(st)
        if st[1] == 'NN' or st[1] == 'VB':
            tokens.append(st[0])
    stop = open("stop.txt", "r").read().split("\n")
    filtered_tokens = [token for token in tokens if token not in stop]
    return " ".join(filtered_tokens)
Example #35
0
def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'), no_above=0.5, no_below=1, keep_n=None):
    """


    :rtype : gensim.corpora.dictionary.Dictionary
    :param corpora: 
    :param stopwords: 
    :param allowed_pos: 
    :param max_doc: 
    :return: 
    """
    logging.info('Lemmatizing the corpora...')
    count = 0
    corpus_num = len(corpora)
    processed_corpora = []
    corpus_id2orig_id = []

    for index, corpus in corpora.items():
        count += 1
        if count > max_doc:
            break
        if corpus is None:  # skip if corpus is None
            continue

        print '\r', count, '/', corpus_num,
        cleaned_corpus = clean_text(corpus)  # delete irrelevant characters
        corpus = []
        tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
        for token in tokens:
            word, pos = token.split('/')
            corpus.append(word)

        # convert compound word into one token
        corpus = convert_compound(corpus)

        # filter stop words, long words, and non-english words
        corpus = [w for w in corpus if not w in stopwords and 2 <= len(w) <= 15 and w.islower()]
        processed_corpora.append(corpus)
        corpus_id2orig_id.append(index)

    print '\n'

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)

    return dictionary
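A hedged usage sketch for preprocess_corpora; it assumes the clean_text and convert_compound helpers referenced above live in the same module and that the NLTK stopword list is installed:

import re
from nltk.corpus import stopwords

docs = {'doc-1': 'Solar panels convert sunlight into electricity.',
        'doc-2': 'Wind turbines also generate electricity.'}
dictionary = preprocess_corpora(
    corpora=docs,
    stopwords=set(stopwords.words('english')),
    allowed_pos=re.compile('(NN)'),
)
bow_corpus = dictionary.corpus            # bag-of-words vectors attached by the function
id_map = dictionary.corpus_id2orig_id     # maps corpus position back to 'doc-1', 'doc-2'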
Example #36
0
def english_lemmatizer(text):
    """ calls the "pattern" module lemmatizer through utils """
    result = utils.lemmatize(text)
    if ONLY_NOUN_VERBS:
        result = filter(lambda x: x.split('/')[-1] == 'VB' or x.split('/')[-1] == 'NN', result)
    if ONLY_NOUNS:
        result = filter(lambda x: x.split('/')[-1] == 'NN', result)
    if DEBUG:
        print text
        print result
    return result
Example #37
0
 def __init__(self, searchPhrase, dbname='TwitterDB', query=False, k=0):
     client = pymongo.MongoClient()
     self.db = client[dbname]
     self.words = [
         word.split('/')[0] for word in lemmatize(
             cleanText.removeStopWords(
                 cleanText.cleanText(searchPhrase)[0]))
     ]
     self.listSearch = {}
     self.query = query
     self.k = k
Example #38
0
def process_file_path(file_path):
    with open(file_path, "r") as file:
        # last character is a breaking /n
        article_name = file.readline()[:-1]

        #remaining lines is doc
        doc = " ".join(file.readlines())

        lemmatized_doc = utils.lemmatize(doc)

        return article_name, lemmatized_doc
Example #39
0
def get_summary(news_link = "http://english.onlinekhabar.com/will-try-to-endorse-medical-education-bill-on-friday-says-speaker.html"):
    # Getting news content
    news_source = urllib.request.urlopen(news_link).read()
    news_soup = bs.BeautifulSoup(news_source,'lxml')
    news_content = news_soup.find_all('div', class_ = 'oke-content-wrap clearfix')
    news_portion = news_content[0].find_all('p')
    news_para = [n.text for n in news_portion]
    news_para = ' '.join(news_para)
    news = news_para.split('\n\t')[0]

    # Get sentences
    news = news.split('\n')
    news = ' '.join(news)
    sentence_tk = sent_tokenize(news)
    print(sentence_tk)

    # Lemmatizing sentences (finding root word)
    tokenized = []
    i = 1

    for sentence in sentence_tk:
        print(i)
        lemmatized_out = [wd.decode('utf-8').split('/')[0] for wd in lemmatize(sentence)]
        lemmatized_out = ' '.join(lemmatized_out)
        tokenized.append(lemmatized_out)
        i = i + 1

    print(tokenized)
    print('\n\n')

    #News sentences clustering
    clustering_data = []
    for token in tokenized:
        vec = model.infer_vector(token.split())  # infer_vector expects a list of tokens
        clustering_data.append(vec)

    data_length = len(clustering_data)
    n_clusters = int(np.floor(data_length/3))
    kmeans = KMeans(n_clusters=n_clusters, n_init = 1)
    kmeans = kmeans.fit(clustering_data)

    #Getting representative sentences
    avg = []
    for j in range(n_clusters):
       idx = np.where(kmeans.labels_ == j)[0]
       avg.append(np.mean(idx))
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, clustering_data)
    ordering = sorted(range(n_clusters), key=lambda k: avg[k])
    summary = ' '.join([sentence_tk[closest[idx]] for idx in ordering])
    #print(summary + '\n\n')
    #print('Length of original text: ',len(sentence_tk))
    #print('Length of summary: ',len(sent_tokenize(summary)))
    return summary
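get_summary() presupposes a module-level gensim Doc2Vec model bound to the name model. A hedged sketch of how such a model might be prepared before calling the function (training corpus and hyperparameters are illustrative assumptions):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

training_docs = [
    TaggedDocument(words="parliament endorses the medical education bill".split(), tags=['0']),
    TaggedDocument(words="speaker promises a vote on friday".split(), tags=['1']),
]
model = Doc2Vec(training_docs, vector_size=50, min_count=1, epochs=20)
summary = get_summary()   # uses the default onlinekhabar.com article URL from the signature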
Example #40
0
def process_article(args):
    """
    Parse a wikipedia article, returning its content as a list of tokens
    (utf8-encoded strings).
    """
    text, lemmatize = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result
Example #41
0
def process_article(args):
    """
    Parse a wikipedia article, returning its content as a list of tokens
    (utf8-encoded strings).
    """
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result, title, pageid
Example #42
0
def parse(text):
    def tokenize(text):
        return [
            token.encode('utf8')
            for token in utils.tokenize(text, lower=True, errors='ignore')
            if 2 <= len(token) <= 20 and not token.startswith('_')
        ]

    global LEMMATIZE
    if LEMMATIZE:
        return utils.lemmatize(text)
    else:
        return tokenize(text)
Example #43
0
def process_post(args):
    """Normalize an entry into tokens"""
    content, lemmatize, subject, pageid = args
    text = url_re.sub('', subject + " " + content)
    
    if lemmatize:
        result = utils.lemmatize(text)
    else: 
        result = [token.encode('utf8') for token in
            utils.tokenize(text, lower=True, errors='ignore')
            if 2 <= len(token) <= 15 and not token.startswith('_')
        ]

    return result, subject, pageid
Example #44
0
 def __init__(self, searchPhrase, dbname='TwitterDB', query=None, k=0):
     self.queries = Queries(dbname)
     self.words = [word.split('/')[0] for word in lemmatize(cleanText.removeStopWords(cleanText.cleanText(searchPhrase)[0]))]
     self.idfs = dict()
     and_list = []
     for word in self.words:
         and_list.append({'words.word': word})
     self.query_search = {"$and" : and_list}
     if query:
         self.existing = True
         self.query_search.update(query)
     else:
         self.existing = False
     self.k = k
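For reference, a standalone sketch of the $and query that the loop above assembles from the lemmatized search words (the field name follows the snippet; the values and collection name are illustrative):

words = ['solar', 'energy']                      # lemmatized search terms
query_search = {'$and': [{'words.word': w} for w in words]}
# -> {'$and': [{'words.word': 'solar'}, {'words.word': 'energy'}]}
# pymongo can consume this directly, e.g. db['tweets'].find(query_search),
# where 'tweets' is a hypothetical collection name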
Example #45
0
 def get_features(self, document):
     #create list of tokens from doc
     logger.debug("Lemmatize document.")
     tokens = utils.lemmatize(document)
     
     #create bow of doc from token list
     logger.debug("Create bag-of-words representation from article.")
     doc_bow = self.dictionary.doc2bow(tokens)
     
     #create tfidf representation from bag-of-words
     logger.debug("Transform to tfidf.")
     doc_tfidf = self.tfidf_model[doc_bow]
     
     return doc_tfidf
Example #46
0
def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
    """
    Parse a wikipedia article, returning its content as a list of tokens
    (utf8-encoded strings).

    Set `tokenizer_func` (defaults to `tokenize`) parameter for languages like japanese or thai to perform better
    tokenization. The `tokenizer_func` needs to take 4 parameters: (text, token_min_len, token_max_len, lower).
    """
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenizer_func(text, token_min_len, token_max_len, lower)
    return result, title, pageid
Example #47
0
def get_trans(line, sid, nitems=None, lemma=True, metadata=True, sw=stopwords.words("english")):
    # logger.info("get_trans")
    if lemma:
        # 	logger.debug("lemma")
        trans = utils.lemmatize(line, stopwords=sw)
    else:
        # 	logger.debug("no lemma")
        trans = utils.tokenize(line.replace(".", ""), lowercase=True)
        trans = " ".join([x.lower() for x in trans])

        # if trunc:
        # 	trans = " ".join(trans.split()[:trunc])
        # print "sw:", sw, "TRANS:", trans
    if metadata:
        return trans, (nitems, sid)
    else:
        return trans
Example #48
0
    def get_texts(self):
        """
        Iterate over the Wikipedia dump and the HN articles returning text
        """
        wiki_articles, hn_articles, articles_all = 0, 0, 0
        positions, positions_all = 0, 0

        # ************ Wikipedia ************
        texts = ((text, self.lemmatize) for _, text in wikicorpus._extract_pages(bz2.BZ2File(self.wiki_file)))
        pool = multiprocessing.Pool(self.processes)
        for group in utils.chunkize(texts, chunksize=10 * pool._processes, maxsize=1): # otherwise imap puts all the corpus into memory
            for tokens in pool.imap(wikicorpus.process_article, group):
                articles_all += 1
                positions_all += len(tokens)
                if len(tokens) > WIKI_ARTICLE_MIN_WORDS:
                    wiki_articles += 1
                    positions += len(tokens)
                    yield tokens
        pool.terminate()

        print (">>> finished iterating over Wikipedia corpus of %i documents with %i positions (total %i articles, %i positions before pruning articles shorter than %i words)" % (wiki_articles, positions, articles_all, positions_all, WIKI_ARTICLE_MIN_WORDS))

        # ************ HN articles ************
        positions_after_wiki = positions
        fnamelist = []
        for g in glob.iglob(self.hn_folder + '/*.txt'):
            fnamelist.append(g)
        for fileno, fname in enumerate(fnamelist): # TODO parallelize as Wiki
            hn_text = open(fname).read()
            if self.lemmatize:
                result = utils.lemmatize(hn_text) # text into lemmas here
            else:
                result = tokenize(hn_text) # text into tokens here
            articles_all += 1
            positions_all += len(result)
            if len(result) > HN_ARTICLE_MIN_WORDS:
                hn_articles += 1
                positions += len(result)
                yield result

        print (">>> finished iterating over HN corpus of %i documents with %i positions" % (hn_articles, positions - positions_after_wiki))
        # ************ /HN articles ************

        self.length = wiki_articles + hn_articles # cache corpus length
Example #49
0
def bowCorpus(root_path):
    vocab = corpora.dictionary.Dictionary()
    corpus = []
    filenames = [os.path.join(root_path, f) for f in os.listdir(root_path)]

    print colored(len(filenames), "green"), "files found in", colored(root_path, "green")

    print "Converting each file into bag-of-word:"
    for fname in pbar(filenames):
        with open(fname, "r") as f:
            content = f.read()

        tokens = utils.lemmatize(content)
        # lemmatize return strings like 'moderate/VB' or 'listing/NN'
        tokens = [x.split("/")[0] for x in tokens]
        bow = vocab.doc2bow(tokens, allow_update=True)
        corpus.append(bow)

    return corpus, vocab
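A hedged follow-up sketch: once bowCorpus() has produced the bag-of-words corpus and vocabulary, they can be handed to a gensim topic model (the folder path and topic count are assumptions):

from gensim import models

corpus, vocab = bowCorpus('data/reports')        # hypothetical folder of .txt files
lda = models.LdaModel(corpus, id2word=vocab, num_topics=20)
top_words = lda.show_topics(num_topics=5, formatted=False)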
Example #50
0
def get_trans(line, sid, nitems=None, lemma=True, metadata=True, sw=stopwords.words("english"), tokens_only=False):
    if lemma:
        trans = utils.lemmatize(line, stopwords=sw)
    else:
        trans = utils.tokenize(line.replace(".", ""), lowercase=True)

        if tokens_only:
            trans = [x.lower() for x in trans]
        else:
            try:
                trans = " ".join([x.lower() for x in trans])

            except:
                logger.error("** get_trans **")
                logger.error(repr(line))
                logger.error(repr(trans))
    if metadata:
        return trans, (nitems, sid)
    else:
        return trans
Example #51
0
    def get_texts(self):
        """
        Files are processed in parallel.
        
        See wikicorpus.py by Radim Rehurek
        """
        logger = logging.getLogger("feature_extractor")

        processed_articles = 0
        for document in self.corpus:
            if processed_articles % 1000 == 0:
                logger.info("Processing article #%d..." % processed_articles)

            processed_articles += 1

            try:
                tokens = utils.lemmatize(document)
                yield tokens
            except Exception as e:
                logger.error("Could not process article: %s" % e)

        logger.info("Processed %d articles." % processed_articles)
Example #52
0
def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN,
                    token_max_len=TOKEN_MAX_LEN, lower=True):
    """Parse a wikipedia article, extract all tokens.

    Notes
    -----
    Set `tokenizer_func` (defaults is :func:`~gensim.corpora.wikicorpus.tokenize`) parameter for languages
    like japanese or thai to perform better tokenization.
    The `tokenizer_func` needs to take 4 parameters: (text: str, token_min_len: int, token_max_len: int, lower: bool).

    Parameters
    ----------
    args : (str, bool, str, int)
        Article text, lemmatize flag (if True, :func:`~gensim.utils.lemmatize` will be used), article title,
        page identificator.
    tokenizer_func : function
        Function for tokenization (defaults is :func:`~gensim.corpora.wikicorpus.tokenize`).
        Needs to have interface:
        tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str.
    token_min_len : int
        Minimal token length.
    token_max_len : int
        Maximal token length.
    lower : bool
         If True - convert article text to lower case.

    Returns
    -------
    (list of str, str, int)
        List of tokens from article, title and page id.

    """
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenizer_func(text, token_min_len, token_max_len, lower)
    return result, title, pageid
Example #53
0
 def get_texts(self):
     '''
     Files are processed in parallel.
     
     See wikicorpus.py by Radim Rehurek
     '''
     logger = logging.getLogger("feature_extractor")
     
     processed_articles = 0
     for article in  Article.objects():
         if processed_articles % 1000 == 0:
             logger.info("Processing article #%d..." % processed_articles)
             
         processed_articles += 1
         
         try:
             doc = article.clean_content
             tokens = utils.lemmatize(doc)
             yield tokens
         except Exception as e:
             logger.error("Could not process article %s (%s): %s" %
                          (article.id, type(e), e))
     
     logger.info("Processed %d articles." % processed_articles)
Example #54
0
import pickle, sys
from gensim import utils

#article_to_score = '../data/paulgraham.com-startupideas.html.txt'
#article_to_score = '../data/paulgraham.com-founder.html.txt'
article_to_score = '../data/paulgraham.com-ycombinator.html.txt'
text = open(article_to_score, 'r').read()

LEMMATIZE = utils.HAS_PATTERN
lda = None
if LEMMATIZE:
    f = open('/Users/gabrielsynnaeve/Dropbox/Public/hn_lemmatized.ldamodel', 'r')
    lda = pickle.load(f)
    a = utils.lemmatize(text)
else:
    print >> sys.stderr, "ERROR: install pattern"
    sys.exit(-1)

user = '******'
if len(sys.argv) > 1:
    user = sys.argv[1]

user_params = None
with open(user + '.params') as f:
    user_params = pickle.load(f)

# score \propto P(Like)
# P(Like=true) \propto \sum_{t \in Topics}[P(TopicsArticle)
#                 * P(\lambda|t,TopicsArticle) * P(t|Like=true) * P(Like=true)]
score = 0.0
for topicid, proba in lda[lda.id2word.doc2bow(a)]:
Example #55
0
        best10 = bests[topicid][:10]
        beststrl = [(topic[i], ldaobject.id2word[i]) for i in best10]
        beststr = " + ".join(["%.3f*%s" % v for v in beststrl])
        if LEMMATIZE:
            print "topic #", topicid, " described by word:", topicnames[topicid].split("/")[0]
        else:
            print "topic #", topicid, " described by word:", topicnames[topicid]
        print beststr


f = None
if LEMMATIZE:
    f = open("hn_lemmatized.ldamodel", "r")
else:
    f = open("hn.ldamodel", "r")
lda = pickle.load(f)
topic_names(lda)

article = open("/Users/gabrielsynnaeve/labs/clojure/hackernews/data/99985.txt", "r").read()

a = None
if LEMMATIZE:
    a = utils.lemmatize(article)
else:
    a = tokenize(article)
print a

for topic, proba in lda[lda.id2word.doc2bow(a)]:
    print lda.show_topic(topic)
    print proba
Example #56
0
         doc = " ".join(file.readlines())
 except Exception as e:
     logger.error("Could not load document from %s" % options.text)
     sys.exit(1)
     
 #load dictionary, tfidf model, lda model, esa model
 logger.info("Load dictionary, tfidf model, lda model and esa model with prefix %s" 
             % options.prefix)
 dictionary = corpora.Dictionary.load(options.prefix + "_wordids.dict")
 tfidf_model = models.TfidfModel.load(options.prefix + "_tfidf.model")
 lda_model = models.LdaModel.load(options.prefix + "_lda.model")
 esa_model = EsaModel.load(options.prefix + "_esa_on_lda.model")
 
 #create list of tokens from doc
 logger.info("Lemmatize document.")
 tokens = utils.lemmatize(doc)
 
 #create bow of doc from token list
 logger.info("Create bag-of-words representation from document.")
 doc_bow = dictionary.doc2bow(tokens)
 
 #create tfidf representation from bag-of-words
 logger.info("Transform to tfidf.")
 doc_tfidf = tfidf_model[doc_bow]
 
 #create lda representation from tfidf
 logger.info("Transform to lda")
 doc_lda = lda_model[doc_tfidf]
 
 #create esa representation from lda
 logger.info("Transform to esa")
Example #57
0
from gensim.utils import lemmatize

# Czech: "Nothing can travel faster than 300 thousand kilometres per second!"
x = lemmatize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!')
print(x)
Example #58
0
import csv  # used by csv.reader below; missing from the original imports

from gensim.models import Word2Vec, Phrases  # Phrases is used below; missing from the original imports
from gensim.utils import lemmatize
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import stopwords
from collections import Counter

print("Reading input file 'input/audits_with_content.csv'")
with open('input/audits_with_content.csv', 'r') as f:
    reader = csv.reader(f)
    raw_documents = list(reader)

print("Prepare documents")
documents = [doc[2] for doc in raw_documents if doc[2] != '']
sentences = []
bigram = Phrases()

for document in documents:
    raw_text = document.lower()
    tokens = lemmatize(raw_text, stopwords=STOPWORDS)
    sentences.append(tokens)
    bigram.add_vocab([tokens])

bigram_counter = Counter()
for key in bigram.vocab.keys():
    if key not in stopwords.words("english"):
        if len(key.split("_")) > 1:
            bigram_counter[key] += bigram.vocab[key]

for key, counts in bigram_counter.most_common(200):
    print '{0: <20} {1}'.format(key.encode("utf-8"), counts)
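The snippet above imports Word2Vec but stops after counting bigrams; a hedged continuation showing how the lemmatized sentences might be fed to the model (hyperparameters are assumptions, and the '/POS' suffixes are stripped first):

clean_sentences = [
    [token.decode('utf-8').split('/')[0] for token in sentence]
    for sentence in sentences
]
model = Word2Vec(clean_sentences, size=100, window=5, min_count=2, workers=4)
# model.wv.most_similar('government', topn=5)   # illustrative query; the word must be in the vocabulary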
Example #59
0
    froms = []
    dates = []
    for index, document in documents.items():
        count += 1
        if count > max_doc:
            break

        print '\r', count, '/', doc_num,
        text = document['text'] + (' ' + index) * title_weight  # incorporate title information
        from_name = document['from']
        date = document['date']

        cleaned = clean_text(text)  # delete irrelevant characters

        document = []
        tokens = lemmatize(content=cleaned, allowed_tags=allowed_pos)  # lemmatize
        for token in tokens:
            word, pos = token.split('/')
            document.append(word)

        # convert compound word into one token
        document = convert_compound(document)

        # filter stop words, long words, and non-english words
        document = [w for w in document if not w in stop_words and 2 <= len(w) <= 15 and w.islower()]

        new_documents.append(document)
        titles.append(index)
        froms.append(from_name)
        dates.append(date)