Code Example #1
 def __iter__(self):
     #for line in open('ebola-raw.txt'):
     for line in open('testdata01.txt'):
         #for line in open('twitter2Mb.txt'):
         line = re.sub('<[^>]+>', '', line)
         utils.lemmatize(line)  # NOTE: the lemmatized result is discarded; the bag-of-words below is built from the raw line
         # assume there's one document per line, tokens separated by whitespace
         yield dictionary.doc2bow(line.lower().split())
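
Most of the snippets on this page post-process the output of gensim.utils.lemmatize, so a minimal sketch of its return format may help. This is an illustration under assumptions: it requires gensim < 4.0 with the Pattern library installed (lemmatize was removed in gensim 4.0), and the sample sentence is made up.

from gensim import utils

lemmas = utils.lemmatize("The striped bats were hanging on their feet")
# Each element is a utf-8 byte string of the form b'lemma/POS' (e.g. b'bat/NN'),
# which is why the examples below either split on '/' or slice off the last
# three characters to drop the tag.
tokens = [lem.decode("utf-8").split("/")[0] for lem in lemmas]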
Code Example #2
File: utils.py Project: paopow/word_similarity_api
def lemmatize_an_idea(idea, use_stoplist=True):
    if idea in lemma_dict:
        return lemma_dict[idea]
    if use_stoplist:
        lemm = [lem[:-3] for lem in lemmatize(idea) if lem[:-3] not in stoplist]
    else:
        lemm = [lem[:-3] for lem in lemmatize(idea) if lem[:-3]]
    lemma_dict[idea] = lemm
    return lemm
Code Example #4
def process_article(args):
    """
    Parse a wikipedia article, returning its content as a list of tokens
    (utf8-encoded strings).
    """
    text, lemmatize, title, pageid = args
    categories = get_categories(text)
    if not list(set(categories).intersection(input_categories)):
        return None, None, None, None
    text = filter_wiki(text)
    sentences = sentence_tokenize(text)
    title = title.replace(' ', '_')
    paragraphs = {}

    # Split document into paragraphs
    # sentences = [s0, s1, t0, s2, t1, ...]
    paragraph_title = [title]
    level = 1
    this_sentences = []

    for sent in sentences:
        # Sent is a paragraph title
        if sent[:1] == '=':
            pt = '/'.join(paragraph_title)
            pt = pt.replace(',', '')
            paragraphs[pt] = this_sentences
            this_sentences = []
            # Level of paragraph
            level = max(len(s) for s in re.findall(r'=+', sent))
            this_title = sent[level:len(sent)-level].strip().replace(' ', '_')
            if level > len(paragraph_title):
                paragraph_title.append(this_title)
            elif level < len(paragraph_title):
                for i in range(len(paragraph_title)-level):
                    paragraph_title.pop()
                paragraph_title[level-1] = this_title
            else:
                paragraph_title[level-1] = this_title
        else:
            this_sentences.append(sent)
    pt = '/'.join(paragraph_title)
    pt = pt.replace(',', '')
    paragraphs[pt] = this_sentences

    if lemmatize:
        # NOTE: `len(v) >= 0` is always true, so no paragraph is filtered out here
        result = {k: [utils.lemmatize(s) for s in v if len(utils.lemmatize(s)) >= 2]
                  for k, v in paragraphs.items() if len(v) >= 0}
    else:
        result = {k: [word_tokenize(s) for s in v if len(word_tokenize(s)) >= 2]
                  for k, v in paragraphs.items() if len(v) >= 0}
    return categories, result, title, pageid
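
To make the section-heading parsing in the loop above concrete, here is a small standalone sketch of the level/title extraction applied to a single wiki-style heading line (the "=== Early life ===" sample is an illustrative assumption based on the code):

import re

sent = "=== Early life ==="
level = max(len(s) for s in re.findall(r'=+', sent))                   # -> 3
this_title = sent[level:len(sent) - level].strip().replace(' ', '_')   # -> 'Early_life'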
Code Example #5
def lemmatizeCorpus(document, isListOfDocs=False):
    if isListOfDocs:
        docs = []
        for doc in document:
            _lemmitizedTokens = lemmatize(doc)
            docs.append([
                token.decode("utf-8").split("/")[0]
                for token in _lemmitizedTokens
            ])
        return docs
    else:
        _lemmitizedTokens = lemmatize(document)
        return [
            token.decode("utf-8").split("/")[0] for token in _lemmitizedTokens
        ]
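
A hypothetical usage of lemmatizeCorpus above (again assuming gensim < 4.0 with Pattern installed and the snippet's lemmatize import in scope; the input strings are made up):

single_doc = lemmatizeCorpus("The cats were chasing the mice")
many_docs = lemmatizeCorpus(["The cats were chasing the mice",
                             "The dogs bark loudly"], isListOfDocs=True)
# single_doc is a flat list of lemmas; many_docs is a list of such lists.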
Code Example #6
 def __init__(self,
              searchPhrase,
              dbname='TwitterDB',
              host='localhost',
              port=27017,
              query=None,
              k=0):
     self.queries = Queries(dbname=dbname, host=host, port=port)
     self.words = [
         word.split('/')[0] for word in lemmatize(
             cleanText.removeStopWords(
                 cleanText.cleanText(searchPhrase)[0]))
     ]
     self.idfs = dict()
     and_list = []
     if self.words:
         for word in self.words:
             and_list.append({'words.word': word})
         self.query_search = {"$and": and_list}
         if query:
             self.existing = True
             self.query_search.update(query)
         else:
             self.existing = False
         self.k = k
Code Example #7
File: helper.py Project: wayne9qiu/cesi
def proc_ent(ent):
    ent = ent.lower().replace('.', ' ').replace('-', ' ').strip().replace(
        '_', ' ').replace('|', ' ').strip()
    ent = ' '.join(
        [tok.decode('utf-8').split('/')[0] for tok in lemmatize(ent)])
    # ent = ' '.join(list( set(ent.split()) - set(config.stpwords)))
    return ent
Code Example #8
    def preprocess_data(cls):
        """
            It will process the ground data on which we are going to test and return the result.
        """
        preprocessed_description, preprocessed_speciality = [], []

        for _, sentence in enumerate(CURO_DATA["Description"].values):
            # Expand contractions first so that we don't lose meaning
            sentence = CURO().contraction(str(sentence))
            # Eliminate words that contain numbers
            sentence = re.sub(r"\S*\d\S*", "", sentence).strip()
            # Eliminate all numerics and special characters
            sentence = re.sub('[^A-Za-z]+', " ", sentence)
            # Remove all stopwords from each sentence, convert to lowercase
            sentence = " ".join(e.lower() for e in str(sentence).split() if e.lower() \
                                not in STOPWORDS)
            # Lemmatize all words
            sentence = " ".join([word.decode('utf-8').split('/')[0] for word in \
                                lemmatize(sentence)])
            preprocessed_description.append(sentence.strip())

        for _, sentence in enumerate(CURO_DATA["Speciality"].values):
            # Eliminate all numerics and special characters
            sentence = sentence.replace("@#$", "") if not sentence.split("@#$")[1] \
                                                    else sentence.replace("@#$", " => ")
            # Remove all stopwords from each sentence, convert to lowercase
            sentence = " ".join(e.lower() for e in str(sentence).split())
            preprocessed_speciality.append(sentence.strip())

        CURO_DATA["Preprocessed_Description"] = preprocessed_description
        CURO_DATA["Preprocessed_Speciality"] = preprocessed_speciality
Code Example #9
def foodwordReplacedTokenizer(review):
    """
    Expand contractions, lemmatize, and replace food-related
    words with "FOODWORD".
    """
    # Expand contractions
    words = []
    for word in review.split():
        word = word.lower()
        if word in contractions:
            word = contractions[word]
        words += [word]
    review = ' '.join(words)

    # Lemmatize from parts of speech
    tokens = []
    for lemma in utils.lemmatize(review):
        lemma, pos = lemma.split('/')
        tokens += [lemma]

    # Re-merge for more processing
    lemmatized_review = ' '.join(tokens)

    # Attach "not" to the word that follows it (e.g. "not good" -> "not_good")
    formatted_lm_review = lemmatized_review.replace(' not ', ' not_')

    # Food word replacement
    words = []
    for word in formatted_lm_review.split():
        if 'noun.food' in [syn.lexname() for syn in wn.synsets(word)]:
            words += ['FOODWORD']
        else:
            words += [word]

    return ' '.join(words)
Code Example #10
def process_texts(bigram, texts):
    """
    Function to process texts. Following are the steps we take:
    
    1. Stopword Removal.
    2. Collocation detection.
    3. Lemmatization (not stem since stemming can reduce the interpretability).
    
    Parameters:
    ----------
    bigram -- trained bigram (collocation) model, applied to each tokenized text
    texts-- Tokenized texts.
    
    Returns:
    -------
    texts: Pre-processed tokenized texts.
    """
    
    # reg. expression tokenizer
    
    texts = [[word for word in line if word not in stops] for line in texts]
    texts = [bigram[line] for line in texts]
    texts = [[word.split('/')[0] for word in lemmatize(' '.join(line), allowed_tags=re.compile('(NN)'), min_length=3)] for line in texts]

    return texts
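
A hedged usage sketch for process_texts: the function relies on a module-level stopword set named `stops` and on a trained gensim Phrases model passed in as `bigram`; both are assumptions here, lemmatize again needs gensim < 4.0 with Pattern installed, and the snippet's own imports (re, lemmatize) are assumed to be in scope.

from gensim.models import Phrases

stops = {"is", "a", "the"}   # assumed module-level stopword set used by process_texts
tokenized_texts = [["the", "topic", "model", "is", "a", "generative", "model"],
                   ["topic", "model", "training", "is", "slow"]]
bigram = Phrases(tokenized_texts, min_count=1, threshold=1)
processed = process_texts(bigram, tokenized_texts)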
Code Example #11
    def phrases(self, clean_text):
        all_lemmas = lemmatize(clean_text, stopwords=self.stopwords)
        curated_words = [str(word).split('/')[0] for word in all_lemmas]
        curated_text = ' '.join(curated_words)

        doc = textacy.Doc(curated_text, lang='en')

        all_phrases = []
        all_phrases += textacy.extract.ngrams(doc,
                                              2,
                                              filter_stops=True,
                                              filter_punct=True,
                                              filter_nums=True)
        all_phrases += textacy.extract.ngrams(doc,
                                              3,
                                              filter_stops=True,
                                              filter_punct=True,
                                              filter_nums=True)
        all_phrases += textacy.extract.ngrams(doc,
                                              4,
                                              filter_stops=True,
                                              filter_punct=True,
                                              filter_nums=True)
        all_phrases += textacy.extract.ngrams(doc,
                                              5,
                                              filter_stops=True,
                                              filter_punct=True,
                                              filter_nums=True)

        phrases = [str(phrase) for phrase in all_phrases]

        return phrases
Code Example #12
def preprocess_text(tweet):
    """
    Function to process an aggregated user profile. This does the following:
    1. Decode html entities. eg. "AT&amp;T" will become "AT&T"
    2. Deaccent
    3. Remove links.
    4. Remove any user mentions (@name).
    5. Lemmatize and remove stopwords.
    
    Parameters:
    ----------
    tweet : String. If you have a list of tweets, ' '.join them and pass the result.
    
    Returns:
    -------
    text : preprocessed (tokenized) tweet.
    """
    tweet = decode_htmlentities(tweet)
    tweet = deaccent(tweet)
    tweet = tweet.encode('ascii',
                         'ignore')  # To prevent UnicodeDecodeErrors later on
    tweet = re.sub(r'http\S+', '', str(tweet))  # Step 3
    tweet = re.sub(r'@\w+', '', str(tweet))  # Step 4
    tweet = tweet.split()
    tweet = lemmatize(' '.join(tweet),
                      re.compile('(NN)'),
                      stopwords=stopwords.words('english'),
                      min_length=3,
                      max_length=15)
    tweet = [word.split('/')[0] for word in tweet]
    return tweet
Code Example #13
File: wiki.py Project: gcisantos/MTPIC
def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN,
                    token_max_len=TOKEN_MAX_LEN, lower=True):
    """Parse a Wikipedia article, extract all tokens.
    Notes
    -----
    Set `tokenizer_func` (defaults to :func:`~gensim.corpora.wikicorpus.tokenize`) parameter for languages
    like Japanese or Thai to perform better tokenization.
    The `tokenizer_func` needs to take 4 parameters: (text: str, token_min_len: int, token_max_len: int, lower: bool).
    Parameters
    ----------
    args : (str, bool, str, int)
        Article text, lemmatize flag (if True, :func:`~gensim.utils.lemmatize` will be used), article title,
        page identifier.
    tokenizer_func : function
        Function for tokenization (defaults to :func:`~gensim.corpora.wikicorpus.tokenize`).
        Needs to have interface:
        tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str.
    token_min_len : int
        Minimal token length.
    token_max_len : int
        Maximal token length.
    lower : bool
         Convert article text to lower case?
    Returns
    -------
    (list of str, str, int)
        List of tokens from article, title and page id.
    """
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenizer_func(text, token_min_len, token_max_len, lower)
    return result, title, pageid
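
As a sketch of the 4-parameter tokenizer_func interface described in the docstring above (the function name and its whitespace-splitting behaviour are illustrative, not part of gensim):

def whitespace_tokenizer(text, token_min_len, token_max_len, lower):
    # Split on whitespace and keep only tokens within the length bounds.
    if lower:
        text = text.lower()
    return [tok for tok in text.split()
            if token_min_len <= len(tok) <= token_max_len]

# It would then be passed as process_article(args, tokenizer_func=whitespace_tokenizer).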
Code Example #14
def preprocess_text(lemma, document):
    with open(document, 'r') as infile:
        # transform document into one string
        text = ' '.join(line.rstrip('\n') for line in infile)
    # convert string into unicode
    text = gensim.utils.any2unicode(text)

    # remove URL's
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '',
                  text)

    # remove symbols excluding the @, # and \s symbol
    text = re.sub(r'[^\w@#\s]', '', text)

    if lemma:
        return utils.lemmatize(text, stopwords=ignore_words, min_length=3)

    # tokenize words using NLTK Twitter Tokenizer
    tknzr = TweetTokenizer()
    text = tknzr.tokenize(text)

    # lowercase, remove words less than len 2 & remove numbers in tokenized list
    text = [
        word.lower() for word in text if len(word) > 2 and not word.isdigit()
    ]

    # remove stopwords
    return [word for word in text if not word in ignore_words]
Code Example #15
File: genrePrediction.py Project: MahdiRag/python
def lemmaSentence1(i, curSentence):
    x = lemmatize(curSentence)
    x = set([y.decode('utf-8').split('/')[0] for y in x])
    x = [str(y).lower() for y in x if len(y) > 2]
    #print("Completed")
    print("Completed for i {0}".format(i))
    return (TaggedDocument(words=x, tags=[str(i)]))
Code Example #16
File: learn_user_tastes.py Project: imclab/HN_stats
def extract_user(user):
    with open('../data/' + user + '/interesting_articles.txt') as stalk_f:
        articles = filter(lambda x: x != '',
                stalk_f.read().rstrip('\n').split(' '))

    tastes = numpy.array([0.0 for i in range(lda.num_topics)])
    total = 0.0
    having = 0
    not_having = 0

    for article in articles:
        #print article
        try:
            text = open('../data/' + article + '.txt').read()
            having += 1
        except IOError: # we don't have this article
            not_having += 1
            continue
        if LEMMATIZE:
            a = utils.lemmatize(text)
        else:
            print >> sys.stderr, "ERROR: install pattern"
            sys.exit(-1)
        for topicid, proba in lda[lda.id2word.doc2bow(a)]:
            total += proba
            tastes[topicid] += proba

    tastes /= total

    of = open(user+'.params', 'w')
    pickle.dump(tastes.tolist(), of)

    print "For user:"******" we had:", having, "and missed:", not_having, "->", having*100.0/(having+not_having+0.000001), "%"
Code Example #17
File: search_mongo.py Project: cipriantruica/CATS
 def __init__(self, searchPhrase, dbname='TwitterDB', query=False, k=0):
     client = pymongo.MongoClient()
     self.db = client[dbname]
     self.words = [word.split('/')[0] for word in lemmatize(cleanText.removeStopWords(cleanText.cleanText(searchPhrase)[0]))]
     self.listSearch = {}
     self.query = query
     self.k = k
Code Example #18
File: hn.py Project: imclab/HN_stats
    def get_texts(self):
        """
        Iterate over the HN articles returning text
        """
        positions, hn_articles = 0, 0

        # ************ HN articles ************
        fnamelist = []
        for g in glob.iglob(self.hn_folder + '/*.txt'):
            fnamelist.append(g)
        for fileno, fname in enumerate(fnamelist):
            hn_text = open(fname).read()
            hn_articles += 1
            if LEMMATIZE:
                result = utils.lemmatize(hn_text)
                positions += len(result)
                yield result
            else:
                result = tokenize(hn_text) # text into tokens here
                positions += len(result)
                yield result

        print (">>> finished iterating over HN corpus of %i documents with %i positions" % (hn_articles, positions))

        self.length = hn_articles # cache corpus length
Code Example #19
 def _pos_tokenize_document(self, doc):
     tokens = simple_preprocess(doc)
     # lemmatizes, POS-tags and removes stopwords (including empty strings) from the token list for stories
     pos_tokens = [
         lemmatize(t) for t in tokens if t not in STOPWORDS and len(t) > 0
     ]
     # flatten the list-of-lists of POS tokens created by previous operation and return
     return [word for inner_list in pos_tokens for word in inner_list]
Code Example #20
def process_article(args):
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result, title, pageid
Code Example #21
    def __init__(self, document):
        """
        :param document: A string with the content of the document.
        """

        # use pattern lemmatizer. see gensim.utils.lemmatizer.
        #Note: len(words) < 15 are filtered out
        self.clean_document_ = utils.lemmatize(document)
Code Example #22
def clean(text):
    text = strip_multiple_whitespaces(strip_non_alphanum(text)).split()
    words = []
    for word in text:
        tmp = lemmatize(word)
        if tmp:
            words.append(tmp[0][:-3].decode("utf-8"))
    return " ".join(words)
Code Example #23
File: topics_tools.py Project: emillon/mosileno-web
def parse(text):
    def tokenize(text):
        return [token.encode('utf8') for token in utils.tokenize(text, lower=True, errors='ignore') if 2 <= len(token) <= 20 and not token.startswith('_')]
    global LEMMATIZE
    if LEMMATIZE:
        return utils.lemmatize(text)
    else:
        return tokenize(text)
Code Example #24
def process_article(args):
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result, title, pageid
Code Example #25
File: search_mongo.py Project: nishara/TECTONIQ
 def __init__(self, searchPhrase, k=0):
     self.words = [
         word.split('/')[0] for word in lemmatize(
             cleanText.removeStopWords(
                 cleanText.cleanText(searchPhrase)[0]))
     ]
     self.listSearch = {}
     self.k = k
Code Example #26
def tokenize(post):
    for currPunct in punctuations:
        post = post.replace(currPunct, "")
    if bool(emoji.get_emoji_regexp().search(post)):
        post = emoji.demojize(post)
    tokens = lemmatize(post)
    tokens = [str(x).split("/")[0].split('\'')[1] for x in tokens]  # str(b"word/NN") == "b'word/NN'", so this strips the POS tag and the bytes-literal quoting
    tokens = [item for item in tokens if not item in stop and item not in add_stop]
    return tokens
Code Example #27
def process_article(args):
    # override original method in wikicorpus.py
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result, title, pageid
Code Example #28
    def _phrases_in_raw_text_via_lemmatisation(self, raw_text):
        """
        Builds a list of lemmas from raw text using lemmatization.
        """
        all_lemmas = lemmatize(raw_text, allowed_tags=re.compile('(NN|JJ)'), stopwords=STOPWORDS_UNICODE)
        document_bigrams = self.fetch_document_bigrams(all_lemmas)
        known_bigrams = [bigram for bigram in document_bigrams if bigram in self.top_bigrams]

        return (all_lemmas + known_bigrams)
Code Example #29
def gensimlemm(texts):
    texts_out = []
    for sent in texts:
        doc = " ".join(sent)
        # print(doc)
        if len(doc) > 0:
            lemmatized_out = [wd.decode('utf-8').split('/')[0] for wd in lemmatize(doc) if wd.decode('utf-8').split('/')[1]=='NN']
            texts_out.append(lemmatized_out)
    return texts_out
Code Example #30
def clean_feedback(row):
    tokenizer = RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter()
    stemmer = StemFilter()
    combined = row['Feedback']
    lemmList = [word.decode('utf-8').split('/')[0] for word in lemmatize(combined)]
    tokenWords = [token.text for token in tokenizer(combined)]
    stemWords = [stemmer.stemfn(word) for word in tokenWords]
    final = tokenWords + lemmList + stemWords
    return ' '.join(set(final))  # Join by space so it is easy for RegexTokenizer to manage
Code Example #31
def GetNounsFromDefinition(definition=str()):
    nouns_ = []
    lemma_ = lemmatize(definition)
    for word in lemma_:
        word_pos_ = word.split('/')
        if word_pos_[1][0] in ['N', 'R', 'J']:
            nouns_.append(word_pos_[0])

    return nouns_
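
One caveat about the snippet above: under Python 3, gensim.utils.lemmatize returns byte strings, so word.split('/') raises a TypeError. A minimal Python 3 variant of the same logic (the function name is hypothetical) could look like this:

def get_nouns_from_definition_py3(definition=""):
    nouns_ = []
    for word in lemmatize(definition):
        # decode the b'lemma/POS' byte string before splitting
        word_pos_ = word.decode('utf-8').split('/')
        if word_pos_[1][0] in ['N', 'R', 'J']:
            nouns_.append(word_pos_[0])
    return nouns_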
Code Example #32
File: document.py Project: Nozdi/webpage-similarity
    def __init__(self, text):
        """
            :param text: content of document
            :type text: string
        """

        # d = {<t1, w1>, ... <tm, wm>}
        self.terms_quantity = Counter(
            lemma for lemma in lemmatize(text) if lemma[:-3] not in STOPWORDS
        )
Code Example #33
def gensimTest(text):
    print 'gensim'
    start = time()
    lemmas = lemmatize(text)
    for lemma in lemmas:
        lemma = lemma.split('/')
        print lemma[0], lemma[1]
    end = time()
    print 'gensim time:', (end-start)
    print "********************************"
Code Example #34
File: views.py Project: arnab17/News-Aggregator
def posNN(text):
    tokens = []
    for word in lemmatize(text):
        st = word.decode("utf-8").split("/")
        #print(st)
        if st[1] == 'NN' or st[1] == 'VB':
            tokens.append(st[0])
    stop = open("stop.txt", "r").read().split("\n")
    filtered_tokens = [token for token in tokens if token not in stop]
    return " ".join(filtered_tokens)
Code Example #35
def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'), no_above=0.5, no_below=1, keep_n=None):
    """


    :rtype : gensim.corpora.dictionary.Dictionary
    :param corpora: 
    :param stopwords: 
    :param allowed_pos: 
    :param max_doc: 
    :return: 
    """
    logging.info('Lemmatizing the corpora...')
    count = 0
    corpus_num = len(corpora)
    processed_corpora = []
    corpus_id2orig_id = []

    for index, corpus in corpora.items():
        count += 1
        if count > max_doc:
            break
        if corpus is None:  # skip if corpus is None
            continue

        print '\r', count, '/', corpus_num,
        cleaned_corpus = clean_text(corpus)  # delete irrelevant characters
        corpus = []
        tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
        for token in tokens:
            word, pos = token.split('/')
            corpus.append(word)

        # convert compound word into one token
        corpus = convert_compound(corpus)

        # filter stop words, long words, and non-english words
        corpus = [w for w in corpus if not w in stopwords and 2 <= len(w) <= 15 and w.islower()]
        processed_corpora.append(corpus)
        corpus_id2orig_id.append(index)

    print '\n'

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)

    return dictionary
Code Example #36
def english_lemmatizer(text):
    """ calls the "pattern" module lemmatizer through utils """
    result = utils.lemmatize(text)
    if ONLY_NOUN_VERBS:
        result = filter(lambda x: x.split('/')[-1] == 'VB' or x.split('/')[-1] == 'NN', result)
    if ONLY_NOUNS:
        result = filter(lambda x: x.split('/')[-1] == 'NN', result)
    if DEBUG:
        print text
        print result
    return result
Code Example #37
 def __init__(self, searchPhrase, dbname='TwitterDB', query=False, k=0):
     client = pymongo.MongoClient()
     self.db = client[dbname]
     self.words = [
         word.split('/')[0] for word in lemmatize(
             cleanText.removeStopWords(
                 cleanText.cleanText(searchPhrase)[0]))
     ]
     self.listSearch = {}
     self.query = query
     self.k = k
Code Example #38
def process_file_path(file_path):
    with open(file_path, "r") as file:
        # last character is a breaking /n
        article_name = file.readline()[:-1]

        #remaining lines is doc
        doc = " ".join(file.readlines())

        lemmatized_doc = utils.lemmatize(doc)

        return article_name, lemmatized_doc
Code Example #39
def get_summary(news_link = "http://english.onlinekhabar.com/will-try-to-endorse-medical-education-bill-on-friday-says-speaker.html"):
    # Getting news content
    news_source = urllib.request.urlopen(news_link).read()
    news_soup = bs.BeautifulSoup(news_source,'lxml')
    news_content = news_soup.find_all('div', class_ = 'oke-content-wrap clearfix')
    news_portion = news_content[0].find_all('p')
    news_para = [n.text for n in news_portion]
    news_para = ' '.join(news_para)
    news = news_para.split('\n\t')[0]

    # Get sentences
    news = news.split('\n')
    news = ' '.join(news)
    sentence_tk = sent_tokenize(news)
    print(sentence_tk)

    # Lemmatizing sentences (finding root word)
    tokenized = []
    i = 1

    for sentence in sentence_tk:
        print(i)
        lemmatized_out = [wd.decode('utf-8').split('/')[0] for wd in lemmatize(sentence)]
        lemmatized_out = ' '.join(lemmatized_out)
        tokenized.append(lemmatized_out)
        i = i + 1

    print(tokenized)
    print('\n\n')

    #News sentences clustering
    clustering_data = []
    for token in tokenized:
        vec = model.infer_vector(token)
        clustering_data.append(vec)

    data_length = len(clustering_data)
    n_clusters = int(np.floor(data_length/3))
    kmeans = KMeans(n_clusters=n_clusters, n_init = 1)
    kmeans = kmeans.fit(clustering_data)

    #Getting representative sentences
    avg = []
    for j in range(n_clusters):
       idx = np.where(kmeans.labels_ == j)[0]
       avg.append(np.mean(idx))
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, clustering_data)
    ordering = sorted(range(n_clusters), key=lambda k: avg[k])
    summary = ' '.join([sentence_tk[closest[idx]] for idx in ordering])
    #print(summary + '\n\n')
    #print('Length of original text: ',len(sentence_tk))
    #print('Length of summary: ',len(sent_tokenize(summary)))
    return summary
Code Example #40
File: wikicorpus.py Project: nAk123/gensim
def process_article(args):
    """
    Parse a wikipedia article, returning its content as a list of tokens
    (utf8-encoded strings).
    """
    text, lemmatize = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result
Code Example #41
def process_article(args):
    """
    Parse a wikipedia article, returning its content as a list of tokens
    (utf8-encoded strings).
    """
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result, title, pageid
Code Example #42
File: topics_tools.py Project: emillon/mosileno-web
def parse(text):
    def tokenize(text):
        return [
            token.encode('utf8')
            for token in utils.tokenize(text, lower=True, errors='ignore')
            if 2 <= len(token) <= 20 and not token.startswith('_')
        ]

    global LEMMATIZE
    if LEMMATIZE:
        return utils.lemmatize(text)
    else:
        return tokenize(text)
Code Example #43
File: buildmodel.py Project: ants/pglsi
def process_post(args):
    """Normalize an entry into tokens"""
    content, lemmatize, subject, pageid = args
    text = url_re.sub('', subject + " " + content)
    
    if lemmatize:
        result = utils.lemmatize(text)
    else: 
        result = [token.encode('utf8') for token in
            utils.tokenize(text, lower=True, errors='ignore')
            if 2 <= len(token) <= 15 and not token.startswith('_')
        ]

    return result, subject, pageid
Code Example #44
 def __init__(self, searchPhrase, dbname='TwitterDB', query=None, k=0):
     self.queries = Queries(dbname)
     self.words = [word.split('/')[0] for word in lemmatize(cleanText.removeStopWords(cleanText.cleanText(searchPhrase)[0]))]
     self.idfs = dict()
     and_list = []
     for word in self.words:
         and_list.append({'words.word': word})
     self.query_search = {"$and" : and_list}
     if query:
         self.existing = True
         self.query_search.update(query)
     else:
         self.existing = False
     self.k = k
Code Example #45
File: extractors.py Project: JOSMANC/nyan
 def get_features(self, document):
     #create list of tokens from doc
     logger.debug("Lemmatize document.")
     tokens = utils.lemmatize(document)
     
     #create bow of doc from token list
     logger.debug("Create bag-of-words representation from article.")
     doc_bow = self.dictionary.doc2bow(tokens)
     
     #create tfidf representation from bag-of-words
     logger.debug("Transform to tfidf.")
     doc_tfidf = self.tfidf_model[doc_bow]
     
     return doc_tfidf
Code Example #46
File: wikicorpus.py Project: jMonteroMunoz/gensim
def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
    """
    Parse a wikipedia article, returning its content as a list of tokens
    (utf8-encoded strings).

    Set `tokenizer_func` (defaults to `tokenize`) parameter for languages like Japanese or Thai to perform better
    tokenization. The `tokenizer_func` needs to take 4 parameters: (text, token_min_len, token_max_len, lower).
    """
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenizer_func(text, token_min_len, token_max_len, lower)
    return result, title, pageid
Code Example #47
File: FisherCorpus.py Project: laic/discourse
def get_trans(line, sid, nitems=None, lemma=True, metadata=True, sw=stopwords.words("english")):
    # logger.info("get_trans")
    if lemma:
        # 	logger.debug("lemma")
        trans = utils.lemmatize(line, stopwords=sw)
    else:
        # 	logger.debug("no lemma")
        trans = utils.tokenize(line.replace(".", ""), lowercase=True)
        trans = " ".join([x.lower() for x in trans])

        # if trunc:
        # 	trans = " ".join(trans.split()[:trunc])
        # print "sw:", sw, "TRANS:", trans
    if metadata:
        return trans, (nitems, sid)
    else:
        return trans
Code Example #48
File: wiki_and_hn.py Project: imclab/HN_stats
    def get_texts(self):
        """
        Iterate over the Wikipedia dump and the HN articles returning text
        """
        wiki_articles, hn_articles, articles_all = 0, 0, 0
        positions, positions_all = 0, 0

        # ************ Wikipedia ************
        texts = ((text, self.lemmatize) for _, text in wikicorpus._extract_pages(bz2.BZ2File(self.wiki_file)))
        pool = multiprocessing.Pool(self.processes)
        for group in utils.chunkize(texts, chunksize=10 * pool._processes, maxsize=1): # otherwise imap puts all the corpus into memory
            for tokens in pool.imap(wikicorpus.process_article, group):
                articles_all += 1
                positions_all += len(tokens)
                if len(tokens) > WIKI_ARTICLE_MIN_WORDS:
                    wiki_articles += 1
                    positions += len(tokens)
                    yield tokens
        pool.terminate()

        print (">>> finished iterating over Wikipedia corpus of %i documents with %i positions (total %i articles, %i positions before pruning articles shorter than %i words)" % (wiki_articles, positions, articles_all, positions_all, WIKI_ARTICLE_MIN_WORDS))

        # ************ HN articles ************
        positions_after_wiki = positions
        fnamelist = []
        for g in glob.iglob(self.hn_folder + '/*.txt'):
            fnamelist.append(g)
        for fileno, fname in enumerate(fnamelist): # TODO parallelize as Wiki
            hn_text = open(fname).read()
            if self.lemmatize:
                result = utils.lemmatize(hn_text) # text into lemmas here
            else:
                result = tokenize(hn_text) # text into tokens here
            articles_all += 1
            positions_all += len(result)
            if len(result) > HN_ARTICLE_MIN_WORDS:
                hn_articles += 1
                positions += len(result)
                yield result

        print (">>> finished iterating over HN corpus of %i documents with %i positions" % (hn_articles, positions - positions_after_wiki))
        # ************ /HN articles ************

        self.length = wiki_articles + hn_articles # cache corpus length
Code Example #49
File: lda.py Project: qpleple/lda-play
def bowCorpus(root_path):
    vocab = corpora.dictionary.Dictionary()
    corpus = []
    filenames = [os.path.join(root_path, f) for f in os.listdir(root_path)]

    print colored(len(filenames), "green"), "files found in", colored(root_path, "green")

    print "Converting each file into bag-of-word:"
    for fname in pbar(filenames):
        with open(fname, "r") as f:
            content = f.read()

        tokens = utils.lemmatize(content)
        # lemmatize return strings like 'moderate/VB' or 'listing/NN'
        tokens = [x.split("/")[0] for x in tokens]
        bow = vocab.doc2bow(tokens, allow_update=True)
        corpus.append(bow)

    return corpus, vocab
Code Example #50
File: TedCorpus.py Project: laic/discourse
def get_trans(line, sid, nitems=None, lemma=True, metadata=True, sw=stopwords.words("english"), tokens_only=False):
    if lemma:
        trans = utils.lemmatize(line, stopwords=sw)
    else:
        trans = utils.tokenize(line.replace(".", ""), lowercase=True)

        if tokens_only:
            trans = [x.lower() for x in trans]
        else:
            try:
                trans = " ".join([x.lower() for x in trans])

            except:
                logger.error("** get_trans **")
                logger.error(repr(line))
                logger.error(repr(trans))
    if metadata:
        return trans, (nitems, sid)
    else:
        return trans
Code Example #51
File: learn_lda.py Project: nederhrj/nyan
    def get_texts(self):
        """
        Files are processed parallel.
        
        See wikicorpus.py by Radim Rehurek
        """
        logger = logging.getLogger("feature_extractor")

        processed_articles = 0
        for document in self.corpus:
            if processed_articles % 1000 == 0:
                logger.info("Processing article #%d..." % processed_articles)

            processed_articles += 1

            try:
                tokens = utils.lemmatize(document)
                yield tokens
            except Exception as e:
                logger.error("Could not process article: %s" % e)

        logger.info("Processed %d articles." % processed_articles)
Code Example #52
File: wikicorpus.py Project: abs51295/gensim
def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN,
                    token_max_len=TOKEN_MAX_LEN, lower=True):
    """Parse a wikipedia article, extract all tokens.

    Notes
    -----
    Set `tokenizer_func` (defaults to :func:`~gensim.corpora.wikicorpus.tokenize`) parameter for languages
    like Japanese or Thai to perform better tokenization.
    The `tokenizer_func` needs to take 4 parameters: (text: str, token_min_len: int, token_max_len: int, lower: bool).

    Parameters
    ----------
    args : (str, bool, str, int)
        Article text, lemmatize flag (if True, :func:`~gensim.utils.lemmatize` will be used), article title,
        page identifier.
    tokenizer_func : function
        Function for tokenization (defaults to :func:`~gensim.corpora.wikicorpus.tokenize`).
        Needs to have interface:
        tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str.
    token_min_len : int
        Minimal token length.
    token_max_len : int
        Maximal token length.
    lower : bool
         If True - convert article text to lower case.

    Returns
    -------
    (list of str, str, int)
        List of tokens from article, title and page id.

    """
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenizer_func(text, token_min_len, token_max_len, lower)
    return result, title, pageid
Code Example #53
 def get_texts(self):
     '''
     Files are processed parallel.
     
     See wikicorpus.py by Radim Rehurek
     '''
     logger = logging.getLogger("feature_extractor")
     
     processed_articles = 0
     for article in  Article.objects():
         if processed_articles % 1000 == 0:
             logger.info("Processing article #%d..." % processed_articles)
             
         processed_articles += 1
         
         try:
             doc = article.clean_content
             tokens = utils.lemmatize(doc)
             yield tokens
         except Exception as e:
             logger.error("Could not process article %s (%s): %s" %
                          (article.id, type(e), e))
     
     logger.info("Processed %d articles." % processed_articles)
Code Example #54
File: score_article.py Project: imclab/HN_stats
import pickle, sys
from gensim import utils

#article_to_score = '../data/paulgraham.com-startupideas.html.txt'
#article_to_score = '../data/paulgraham.com-founder.html.txt'
article_to_score = '../data/paulgraham.com-ycombinator.html.txt'
text = open(article_to_score, 'r').read()

LEMMATIZE = utils.HAS_PATTERN
lda = None
if LEMMATIZE:
    f = open('/Users/gabrielsynnaeve/Dropbox/Public/hn_lemmatized.ldamodel', 'r')
    lda = pickle.load(f)
    a = utils.lemmatize(text)
else:
    print >> sys.stderr, "ERROR: install pattern"
    sys.exit(-1)

user = '******'
if len(sys.argv) > 1:
    user = sys.argv[1]

user_params = None
with open(user + '.params') as f:
    user_params = pickle.load(f)

# score \proto P(Like) 
# P(Like=true) \propto \sum_{t \in Topics}[P(TopicsArticle)
#                 * P(\lambda|t,TopicsArticle) * P(t|Like=true) * P(Like=true)]
score = 0.0
for topicid, proba in lda[lda.id2word.doc2bow(a)]:
Code Example #55
File: test_lda.py Project: imclab/HN_stats
        best10 = bests[topicid][:10]
        beststrl = [(topic[i], ldaobject.id2word[i]) for i in best10]
        beststr = " + ".join(["%.3f*%s" % v for v in beststrl])
        if LEMMATIZE:
            print "topic #", topicid, " described by word:", topicnames[topicid].split("/")[0]
        else:
            print "topic #", topicid, " described by word:", topicnames[topicid]
        print beststr


f = None
if LEMMATIZE:
    f = open("hn_lemmatized.ldamodel", "r")
else:
    f = open("hn.ldamodel", "r")
lda = pickle.load(f)
topic_names(lda)

article = open("/Users/gabrielsynnaeve/labs/clojure/hackernews/data/99985.txt", "r").read()

a = None
if LEMMATIZE:
    a = utils.lemmatize(article)
else:
    a = tokenize(article)
print a

for topic, proba in lda[lda.id2word.doc2bow(a)]:
    print lda.show_topic(topic)
    print proba
Code Example #56
File: esa_sample.py Project: JOSMANC/nyan
         doc = " ".join(file.readlines())
 except Exception as e:
     logger.error("Could not load document from %s" % options.text)
     sys.exit(1)
     
 #load dictionary, tfidf model, lda model, esa model
 logger.info("Load dictionary, tfidf model, lda model and esa model with prefix %s" 
             % options.prefix)
 dictionary = corpora.Dictionary.load(options.prefix + "_wordids.dict")
 tfidf_model = models.TfidfModel.load(options.prefix + "_tfidf.model")
 lda_model = models.LdaModel.load(options.prefix + "_lda.model")
 esa_model = EsaModel.load(options.prefix + "_esa_on_lda.model")
 
 #create list of tokens from doc
 logger.info("Lemmatize document.")
 tokens = utils.lemmatize(doc)
 
 #create bow of doc from token list
 logger.info("Create bag-of-words representation from document.")
 doc_bow = dictionary.doc2bow(tokens)
 
 #create tfidf representation from bag-of-words
 logger.info("Transform to tfidf.")
 doc_tfidf = tfidf_model[doc_bow]
 
 #create lda representation from tfidf
 logger.info("Transform to lda")
 doc_lda = lda_model[doc_tfidf]
 
 #create esa representation from lda
 logger.info("Transform to esa")
Code Example #57
from gensim.utils import lemmatize

x = lemmatize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!')
print(x)
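
Worth noting: gensim.utils.lemmatize is backed by the English lemmatizer from the Pattern library, so a Czech sentence like the one above will mostly pass through unlemmatized or be tagged incorrectly. For comparison, an English input (an illustrative translation of the same sentence) yields the usual b'lemma/POS' pairs:

y = lemmatize('Nothing can travel faster than 300 thousand kilometres per second!')
print(y)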
Code Example #58
import csv  # needed for csv.reader below

from gensim.models import Word2Vec, Phrases
from gensim.utils import lemmatize
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import stopwords
from collections import Counter

print("Reading input file 'input/audits_with_content.csv'")
with open('input/audits_with_content.csv', 'r') as f:
    reader = csv.reader(f)
    raw_documents = list(reader)

print("Prepare documents")
documents = [doc[2] for doc in raw_documents if doc[2] != '']
sentences = []
bigram = Phrases()

for document in documents:
    raw_text = document.lower()
    tokens = lemmatize(raw_text, stopwords=STOPWORDS)
    sentences.append(tokens)
    bigram.add_vocab([tokens])

bigram_counter = Counter()
for key in bigram.vocab.keys():
    if key not in stopwords.words("english"):
        if len(key.split("_")) > 1:
            bigram_counter[key] += bigram.vocab[key]

for key, counts in bigram_counter.most_common(200):
    print '{0: <20} {1}'.format(key.encode("utf-8"), counts)
Code Example #59
    froms = []
    dates = []
    for index, document in documents.items():
        count += 1
        if count > max_doc:
            break

        print '\r', count, '/', doc_num,
        text = document['text'] + (' ' + index) * title_weight  # incorporate title information
        from_name = document['from']
        date = document['date']

        cleaned = clean_text(text)  # delete irrelevant characters

        document = []
        tokens = lemmatize(content=cleaned, allowed_tags=allowed_pos)  # lemmatize
        for token in tokens:
            word, pos = token.split('/')
            document.append(word)

        # convert compound word into one token
        document = convert_compound(document)

        # filter stop words, long words, and non-english words
        document = [w for w in document if not w in stop_words and 2 <= len(w) <= 15 and w.islower()]

        new_documents.append(document)
        titles.append(index)
        froms.append(from_name)
        dates.append(date)