Code Example #1
File: predict_topics.py Project: dchecks/taggernews
def create_logistic_model(df, _story_ids, data):
    results = []
    lr_models = {}
    # `labels` is assumed to be a module-level sequence of topic labels in the original project
    for label in set(labels):
        positive_story_ids = set(df >> sift(X["labels"] == label) >> X.story_id.values)
        y_ = np.array([s in positive_story_ids for s in _story_ids])
        X_ = data
        lr = linear_model.LogisticRegression(C=C_VALUE)
        # logging.info expects a format string first; extra positional args need %s placeholders
        logging.info("%s %s", label, Counter(y_))

        # scikit-learn moved cross_val_score from cross_validation to model_selection (>= 0.18)
        cv_score = model_selection.cross_val_score(
            lr, X_, y_, cv=10, scoring="roc_auc").mean()

        lr = lr.fit(X_, y_)
        lr_models[label] = lr
        probs = lr.predict_proba(X_)[:, 1]
        results.append({"alg": "log reg", "label": label, "auc": cv_score})
        logging.info("C=%s label=%s auc=%s n>0.19=%s %s",
                     C_VALUE, label, cv_score, len(probs[probs > 0.19]), Counter(labels == label))
        logging.info("")  # blank separator; logging.info() without a message raises TypeError
    results_df = pd.DataFrame(results)

    lr_fname = make_time_filename(LOGISTIC_MODEL_NAME, ".pkl")
    logging.info("writing file", lr_fname)
    with open(lr_fname, "wb") as f:
        pickle.dump(lr_models, f, protocol=2)
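
The function above trains one binary LogisticRegression per label (a one-vs-rest setup) and pickles the resulting dict of models. A minimal sketch of how such a pickle might be loaded and applied later, assuming the same feature-matrix layout; the file name and the predict_labels helper below are hypothetical, not part of the project:

import pickle

import numpy as np

MODEL_FILE = "logistic_models.pkl"  # hypothetical name; the project derives it from make_time_filename()


def predict_labels(features, threshold=0.19):
    """Return, for each row of `features`, the labels whose model scores above `threshold`."""
    with open(MODEL_FILE, "rb") as f:
        lr_models = pickle.load(f)  # dict: label -> fitted LogisticRegression
    predictions = [[] for _ in range(features.shape[0])]
    for label, model in lr_models.items():
        probs = model.predict_proba(features)[:, 1]
        for row in np.flatnonzero(probs > threshold):
            predictions[row].append(label)
    return predictions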
Code Example #2
    def get_features(self, question, context):
        stop = set() if self.stop_words is None else self.stop_words.words
        context_features = np.zeros((len(context), 3))

        if not self.require_unique_match:
            question_words = set(x for x in question if x.lower() not in stop)
            question_words_lower = set(x.lower() for x in question)
            question_words_stem = set(
                self.lemmatize_word(x) for x in question_words_lower)
        else:
            question_words = set(k for k, v in Counter(question).items()
                                 if v == 1)
            question_words_lower = set(k for k, v in Counter(
                x.lower() for x in question_words).items() if v == 1)
            question_words_stem = set(k for k, v in Counter(
                self.lemmatize_word(x) for x in question_words_lower).items()
                                      if v == 1)

        for i, word in enumerate(context):
            if word in question_words:
                context_features[i][:3] = 1
            elif word.lower() in question_words_lower:
                context_features[i][:2] = 1
            elif self._lemmatizer.lemmatize(word) in question_words_stem:
                context_features[i][2] = 1

        if self.empty_question_features:
            return np.zeros((len(question), 3)), context_features
        else:
            return np.zeros((len(question), 0)), context_features
Code Example #3
File: main.py Project: Shivanisingh05/TwitterBot
def llt(query):
    print(
        colored(
            "*****************************************************************************************"
            "********************************************************************************************",
            color='magenta'))
    public_tweets = get_tweets(query)
    location = {}
    language = {}
    time_zone = {}
    for tweet in public_tweets['statuses']:
        loc = tweet['user']['location']
        lang = tweet['user']['lang']
        tz = tweet['user']['time_zone']

        if loc in location:
            location[loc] += 1
        else:
            location[loc] = 1
        if lang in language:
            language[lang] += 1
        else:
            language[lang] = 1
        if tz in time_zone:
            time_zone[tz] += 1
        else:
            time_zone[tz] = 1
    if None in time_zone:
        del time_zone[None]
    if '' in time_zone:
        del time_zone['']
    if '' in language:
        del language['']
    if '' in location:
        del location['']
    if None in location:
        del location[None]
    if None in language:
        del language[None]
    language_count = dict(Counter(language).most_common(4))
    print(colored("language: ", color='green', attrs=['bold']))
    print(language_count)
    location_count = dict(Counter(location).most_common(4))
    print(colored("locations: ", color='green', attrs=['bold']))
    print(location_count)
    time_zone_count = dict(Counter(time_zone).most_common(4))
    print(colored("Time Zone: ", color='green', attrs=['bold']))
    print(time_zone_count)
    print(
        colored(
            "*****************************************************************************************"
            "********************************************************************************************",
            color='magenta'))
Code Example #4
def llt(query):

    public_tweets = get_tweets(query)

    global time_zone1, loca, lang
    location = {}
    language = {}
    time_zone = {}
    for tweet in public_tweets['statuses']:
        loca = tweet['user']['location']
        lang = tweet['user']['lang']
        time_zone1 = tweet['user']['created_at']  # note: despite the name, this tallies the user's account creation timestamp
        if loca in location:
            location[loca] += 1
        else:
            location[loca] = 1
        if lang in language:
            language[lang] += 1
        else:
            language[lang] = 1
        if time_zone1 in time_zone:
            time_zone[time_zone1] += 1
        else:
            time_zone[time_zone1] = 1

    # limiting the display of the values
    if None in time_zone:
        del time_zone[None]
    if '' in time_zone:
        del time_zone['']
    if '' in language:
        del language['']
    if '' in location:
        del location['']
    if None in location:
        del location[None]
    if None in language:
        del language[None]

    language_count = dict(Counter(language).most_common(5))
    print(colored("Language: ", color='green', attrs=['bold']))
    print(language_count)

    location_count = dict(Counter(location).most_common(5))
    print(colored("Location: ", color='green', attrs=['bold']))
    print(location_count)
    time_zone_count = dict(Counter(time_zone).most_common(5))
    print(colored("Time Zone: ", color='green', attrs=['bold']))
    print(time_zone_count)
Code Example #5
def process_document(self, documents):
    tokenizer = data.load('tokenizers/punkt/english.pickle')  # note: punkt splits text into sentences, not words
    lemmatizer = WordNetLemmatizer()
    stopwords = corpus.stopwords.words('english')
    tf = []  # term frequency
    idf = {}  # keyed by token, so it must be a dict rather than a list
    tokens_list_doc_wise = []
    all_tokens = set()

    for document in documents:
        tokens = tokenizer.tokenize(document)
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        tokens = [token for token in tokens if token not in stopwords]
        tokens_list_doc_wise.append(tokens)
        tf.append(Counter(tokens))
        all_tokens.update(tokens)  # union() returns a new set; update() modifies all_tokens in place

    # calculating idf
    for token in all_tokens:
        present_in_documents = 0
        for x in range(len(documents)):
            if tf[x][token] > 0:
                present_in_documents += 1
        idf[token] = math.log(len(documents) / present_in_documents)

    # calculating tf_idf for tokens document wise (one dict per document)
    tf_idf = []
    for x in range(len(tokens_list_doc_wise)):
        tf_idf.append({})
        for token in tokens_list_doc_wise[x]:
            tf_idf[x][token] = tf[x][token] * idf[token]
    return tf_idf
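
As a sanity check on the bookkeeping above, here is a self-contained toy version of the same TF/IDF computation using only Counter and math; the corpus and all names are invented for illustration:

import math
from collections import Counter

docs = [["cat", "sat", "mat"], ["cat", "ate", "fish"]]

tf = [Counter(doc) for doc in docs]              # term frequency per document
all_tokens = set()
for doc in docs:
    all_tokens.update(doc)                       # update() grows the set in place

idf = {}
for token in all_tokens:
    df = sum(1 for counts in tf if counts[token] > 0)    # number of documents containing the token
    idf[token] = math.log(len(docs) / df)

tf_idf = [{token: counts[token] * idf[token] for token in counts} for counts in tf]
# "cat" appears in both documents, so idf["cat"] == log(2/2) == 0.0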
Code Example #6
    def train(self, trainset):
        X = []
        y = []
        self.tfidf = TfidfVectorizer()
        for sent in trainset:
            self.tfidfWordList.append(sent['target_word'])
            for item in sent['target_word'].split(" "):
                self.wordList.append(item)
        #tf model
        self.tfList = Counter(self.wordList)
        tf_values = np.array(list(self.tfList.values()))  # avoid shadowing the built-in max()
        self.maxNumber = np.max(tf_values)
        #tfidf model
        weightTfidf = self.tfidf.fit_transform(self.tfidfWordList).toarray()

        zeroVector = np.zeros(len(weightTfidf[0]))
        for item in weightTfidf:
            itemVector = np.array(item)
            zeroVector += itemVector
        # note: scikit-learn >= 1.2 replaces get_feature_names() with get_feature_names_out()
        self.tfidfResult = dict(zip(self.tfidf.get_feature_names(),
                                    zeroVector))
        # self.tfidfResult = {key:value for key,value in self.tfidf.vocabulary_.items()}
        # self.normal = np.max(np.array([item for item in self.tfidfResult.values()]))
        # print(self.tfList)
        for sent in trainset:
            X.append(self.extract_features(sent['target_word']))
            y.append(sent['gold_label'])
        self.model.fit(X, y)
        title = "TF+TFIDF " + self.language.capitalize()
        self.plot_learning_curve(self.model, title, X, y)
Code Example #7
def modified_precision(candidate, references, n):
    candidate_counter = Counter(get_ngrams(candidate, n))
    if not candidate_counter:
        return 0

    max_reference_counter = {}
    for reference in references:
        reference_counter = Counter(get_ngrams(reference, n))
        for ngram in candidate_counter:
            max_reference_counter[ngram] = max(
                max_reference_counter.get(ngram, 0), reference_counter[ngram])

    # clip the candidate's n-gram counts (not the last reference's) against the
    # maximum count observed in any single reference
    clipped_counter = dict(
        (ngram, min(candidate_count, max_reference_counter[ngram]))
        for ngram, candidate_count in candidate_counter.items())
    return sum(clipped_counter.values()) / sum(candidate_counter.values())
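
A quick usage sketch for the function above on the classic clipping example; the tiny get_ngrams stand-in is only for illustration, and the project's real helper may differ:

from collections import Counter

def get_ngrams(tokens, n):
    # minimal stand-in: contiguous n-grams as tuples
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

candidate = "the the the the the the the".split()
references = ["the cat is on the mat".split(),
              "there is a cat on the mat".split()]
print(modified_precision(candidate, references, 1))  # 2/7 ≈ 0.2857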
Code Example #8
def td_idf(text_array, neg_word_occ, pos_word_occ):
    for i in text_array:
        word_map = Counter(i)
        for wd in word_map:
            word_map[wd] = log(len(text_array) / (neg_word_occ[wd] + pos_word_occ[wd]), 10) * word_map[wd] / len(i)
        print(word_map)
        break  # debug: only the first document of text_array is processed and printed
Code Example #9
def countWords(self):
    filteredWords = cleanWords(self.contents)
    counts = Counter(filteredWords)
    wordFreq = dict()
    # Convert to relative frequency
    for c in counts:
        wordFreq[c] = counts[c] / len(filteredWords)
    return wordFreq
Code Example #10
def td_idf_to_vec(dataset, dim, neg_word_occ, pos_word_occ):
    result = zeros((len(dataset), dim))
    for i, val in enumerate(dataset):
        word_map = Counter(val)
        for wd in val:
            print(word_map)  # debug output, printed once per token
            # assumes each dataset entry is a sequence of integer word indices (< dim),
            # so wd can index both the occurrence tables and the output row
            result[i][wd] = log(len(dataset) / (neg_word_occ[wd] + pos_word_occ[wd]), 10) * word_map[wd] / len(val)
    return result
Code Example #11
def ngrams_over_file(sourcefile, n, targetfile):
    f = open(targetfile, 'w')
    data_set = read_file(sourcefile)
    split_it = data_set.split()
    split_it = list(filter(lambda x: x not in stopwords, split_it))
    split_it = list(filter(lambda x: len(x) > 2, split_it))
    counter = Counter(split_it)
    if n > 1:
        grams = ngrams(split_it, n)
        counter = Counter(grams)
    most_occur = counter.most_common(20)
    for occur in most_occur:
        if n == 1:
            f.write(occur[0] + "|" + str(occur[1]) + "\n")
        else:
            f.write(' '.join(map(str, occur[0])) + "|" + str(occur[1]) + "\n")
    print(most_occur)
    f.close()
Code Example #12
File: Processing.py Project: joshuacc1/rss_reader
    def getsimiliar(self, word, text):
        T = Text(text)
        word_context_index = ContextIndex(T.tokens,
                                          filter=lambda x: x.isalpha(),
                                          key=lambda s: s.lower())
        word = word.lower()
        wci = word_context_index._word_to_contexts
        words = []
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = Counter(w for w in wci.conditions() for c in wci[w]
                         if c in contexts and not w == word)
            words = [w for w, _ in fd.most_common(20)]
        return words
Code Example #13
File: parser.py Project: LucaPrg/TLN
def cleaning(sentence: str,
             method: str,
             frequency: int = 0,
             percentage: int = 0):
    """
    :param sentence: Definition to clean
    :param method: string which define which method to call
    :param frequency: if not None define minimum number of words repetition
    :param percentage: percentage of the highest frequent words to take
    :return Counter(key=word,value=frequency): sentence cleaned
    """
    tokenized: Counter = rm_stopwords_punctuation(sentence)
    tokenized = utility.remove_number_key(tokenized,
                                          minimum=1950,
                                          maximum=2030)
    if len(tokenized) <= 0:
        return Counter()
    elif frequency > 0:
        # Filtering only words with at least frequency occurrences
        filtered = dict(filter(lambda x: x[1] >= frequency, tokenized.items()))
        i = 1
        while len(filtered) <= 0:
            filtered = dict(
                filter(lambda x: x[1] >= frequency - i, tokenized.items()))
            i += 1
        return globals()[method](Counter(filtered))
    # If a percentage is defined take the first elements (based on percentage), otherwise take everything
    elif percentage > 0:
        percentage = int((percentage / 100) * len(tokenized))
        most_common = tokenized.most_common(percentage)
        tokenized = Counter(
            dict(
                filter(lambda elem: elem[0] in dict(most_common).keys(),
                       tokenized.items())))

    return globals()[method](tokenized)
Code Example #14
File: parser.py Project: LucaPrg/TLN
def rm_stopwords_punctuation(sentence: str,
                             language="english",
                             stamp=False) -> Counter:
    tokens = word_tokenize(sentence)
    if len(tokens) > 0:
        tokens[0] = tokens[0].lower()
    sentence = Counter(tokens)
    stopwords_list = set(stopwords.words(language))
    stop_punctuation = stopwords_list.union(resources.punctuation).union(
        resources.ambiguous)
    filtered = utility.filter_by_set(sentence, stop_punctuation)
    if stamp:
        print("---Removing Stopwords---")
        print("Stopwords in", language, ":", stopwords_list)
        print("Sentence with stopwords and punctuation removed:\n", filtered)
    return filtered
Code Example #15
File: main.py Project: Shivanisingh05/TwitterBot
def Topusage():
    new_tweets = api.user_timeline(screen_name='@narendramodi',
                                   count=200,
                                   tweet_mode='extended')
    for tweet in new_tweets:
        #print(tweet.full_text)
        temp = []
        temp.append(tweet.full_text)
        temp1 = temp
        import re
        words = re.sub(r"http\S+", " ", str(temp1))
        word = words.split()
        # keep only the words that are NOT stopwords; the original comparison was inverted,
        # so the follow-up loop could never append anything
        word1 = [w for w in word if w not in stop_words]
        num = Counter(word1).most_common(10)
        print(num)
Code Example #16
File: nlp_udemy.Py Project: fubu03/NLP_scripts
namedEnt.draw()


#6 Bag of Words Model-Most common words datacamp 

from nltk import Counter
from nltk.corpus import stopwords
# so far we already have text, lets do a quick bow model:


alpha_tokens_lower= [w.lower() for w in nltk.word_tokenize(text) if w.isalpha()]
no_stops=[w for w in alpha_tokens_lower if w not in stopwords.words('english') ]


len(alpha_tokens_lower) - len(no_stops)  # number of stopword tokens removed (value is only shown in an interactive session)
words_count=Counter(no_stops)

words_count.most_common(17)

#7 Bag of words model Udemy:

#we already have the text:
import re
dataset=nltk.sent_tokenize(text)


# lowercase, replace non-word characters with a single space, collapse whitespace
for i in range(len(dataset)):
    dataset[i]=dataset[i].lower()
    dataset[i]=re.sub(r'\W', ' ', dataset[i])
    dataset[i]=re.sub(r'\s+',' ',dataset[i])  
Code Example #17
    def count_words(cls, items: list):
        counter = Counter(items)
        return counter.most_common(5)
Code Example #18
File: text.py Project: LBJ-Wade/NLOOP
    def _token_counter(self):
        """Return the counts of all tokens"""
        return Counter([word for doc in self.tokens for word in doc])
Code Example #19
File: parser.py Project: LucaPrg/TLN
def lemmer(tokens) -> Counter:
    lemmed = Counter()
    for k in tokens.keys():
        lemmed.update({lemmatizer.lemmatize(k): tokens[k]})
    return lemmed
Code Example #20
                    semantic_values = stringate_value(hypernom_subj, hypernom_dobj)
                    semantic_type.append(semantic_values)

    return semantic_type, sentences_analyzed


if __name__ == '__main__':
    verbs_bf = ['build', 'love', 'eat']

    for verb_base_form in verbs_bf:

        sentences = get_sentences_with_verb(verb_base_form)
        print('*' * 50)
        print('\nCurrent verb base form : {}\n'.format(verb_base_form))
        semantic_cluster, sentences_analyzed = get_semantic_cluster(sentences, verb_base_form)
        print('------ End extraction-----------')

        # Print stats
        sts_semantic_cluster = Counter(semantic_cluster)
        common_semantic_cluster = sts_semantic_cluster.most_common(5)
        plot_result(common_semantic_cluster, verb_base_form)

        print('\nAnalyzed {} sentences \nFor the verb in base form : {} the pairs of semantic types are:\n'
              .format(sentences_analyzed, verb_base_form))

        for s in sts_semantic_cluster:
            print('\t< {} > Count {} '.format(s, sts_semantic_cluster[s]))

        print('*' * 50)
        print('\n\n\n')
Code Example #21
    def statistics(self, trainset):
        for sent in trainset:
            # raw strings keep "\w" from being treated as a (deprecated) string escape
            self.wordNumber += len(
                re.sub(r"[^\w']", " ", sent['sentence']).split())
            self.wordbackup += re.sub(r"[^\w']", " ", sent['sentence']).split()
        self.wordCounter = Counter(self.wordbackup)
Code Example #22
File: bleu.py Project: ningmengwei-ata/Class-Project
def _modified_precision(candidate, references, n):
    """Calculate modified ngram precision.

    The normal precision method may lead to some wrong translations with
    high precision, e.g., a translation in which a reference word is
    repeated many times can score very high precision. So in the modified
    n-gram precision, a reference word is considered exhausted once a
    matching candidate word has been identified.

    Paper examples:

    >>> _modified_precision(
    ...    'the the the the the the the'.split(),
    ...    ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()],
    ...    n=1,
    ... )
    0.28...

    >>> _modified_precision(
    ...    'the the the the the the the'.split(),
    ...    ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()],
    ...    n=2,
    ... )
    0.0

    >>> _modified_precision(
    ...    'of the'.split(),
    ...    [
    ...        'It is a guide to action that ensures that the military will forever heed Party commands.'.split(),
    ...        'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(),
    ...        'It is the practical guide for the army always to heed the directions of the party'.split(),
    ...    ],
    ...    n=1,
    ... )
    1.0

    >>> _modified_precision(
    ...    'of the'.split(),
    ...    [
    ...        'It is a guide to action that ensures that the military will forever heed Party commands.'.split(),
    ...        'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(),
    ...        'It is the practical guide for the army always to heed the directions of the party'.split(),
    ...    ],
    ...    n=2,
    ... )
    1.0

    More examples:

    >>> weights = [0.25, 0.25, 0.25, 0.25]
    >>> candidate1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...               'ensures', 'that', 'the', 'military', 'always',
    ...               'obeys', 'the', 'commands', 'of', 'the', 'party']

    >>> candidate2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
    ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
    ...               'that', 'party', 'direct']

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...               'heed', 'Party', 'commands']

    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']

    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']

    Unigrams:

    >>> _modified_precision(
    ...    candidate1,
    ...    [reference1, reference2, reference3],
    ...    n=1,
    ... )
    0.94...

    >>> _modified_precision(
    ...    candidate2,
    ...    [reference1, reference2, reference3],
    ...    n=1,
    ... )
    0.57...

    Bigrams:

    >>> _modified_precision(
    ...    candidate1,
    ...    [reference1, reference2, reference3],
    ...    n=2,
    ... )
    0.58...

    >>> _modified_precision(
    ...    candidate2,
    ...    [reference1, reference2, reference3],
    ...    n=2,
    ... )
    0.07...

    """
    counts = Counter(ngrams(candidate, n))

    if not counts:
        return 0

    max_counts = {}
    for reference in references:
        reference_counts = Counter(ngrams(reference, n))
        for ngram in counts:
            max_counts[ngram] = max(max_counts.get(ngram, 0),
                                    reference_counts[ngram])

    clipped_counts = dict((ngram, min(count, max_counts[ngram]))
                          for ngram, count in counts.items())

    return sum(clipped_counts.values()) / sum(counts.values())
Code Example #23
import pandas
import vincent
from nltk import Counter

from DB import db
from Preprocessing import preprocess, stop

count = 0
db = db()

allTweets = db.getAll()

count_all_hashtags = Counter()
count_all_terms = Counter()
dates_hashtag = []
for tweet in allTweets:
    tweetText = tweet['text'].lower()
    # Bigrams list
    termsWithoutStopwords = [
        term for term in preprocess(tweetText) if term not in stop
    ]
    # termsBigrams = bigrams(termsWithoutStopwords)

    # Hashtags list
    terms_hash = [
        term for term in preprocess(tweetText) if term.startswith('#')
    ]
    if '#marchfortruth' in terms_hash:
        dates_hashtag.append(tweet['created_at'])

    # Update the counter(s)
Code Example #24
File: parser.py Project: LucaPrg/TLN
def stemmer(tokens) -> Counter:
    stemmed = Counter()
    for k in tokens.keys():
        stemmed.update({stemmatizer.stem(k): tokens[k]})
    return stemmed
Code Example #25
def get_features(text, setting):
    if setting == 'bow':
        return {word: count for word, count in Counter(preprocess(text)).items() if word not in stoplist}
    else:
        return {word: True for word in preprocess(text) if word not in stoplist}
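
A brief usage sketch; the preprocess stand-in and the stoplist below are invented for illustration and will differ from the project's own versions:

from collections import Counter

def preprocess(text):
    return text.lower().split()   # stand-in tokenizer

stoplist = {"the", "a", "of", "on"}

print(get_features("The cat sat on the mat", "bow"))
# {'cat': 1, 'sat': 1, 'mat': 1}
print(get_features("The cat sat on the mat", "boolean"))
# {'cat': True, 'sat': True, 'mat': True}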
Code Example #26
# count the words in data_split (built earlier in the script): if a word is already in
# the wordcount dictionary, increment its count; otherwise add it with a count of 1
for item in data_split:
    if item in wordcount:
        wordcount[item] += 1
    else:
        wordcount[item] = 1

qstring = "I think I will get the best score in the class"
qstring_split = qstring.split()
qstring_dict = {}
for word in qstring_split:
    if word in qstring_dict.keys():
        qstring_dict[word] += 1
    else:
        qstring_dict[word] = 1
#count bigrams in the text file
from nltk import Counter
data_bi = Counter(nltk.bigrams(data_split))
q_bi = Counter(nltk.bigrams(qstring_split))
#count the probability of each word in wordcount dictionary
biprob_list = []
for item in q_bi:
    if item in data_bi:
        biprob_list.append(q_bi[item] / data_bi[item])
    else:
        # an unseen bigram should contribute probability 0 (the original assigned
        # a dead variable bi_prob here and never used it)
        biprob_list.append(0)
total_prob = 1
for prob in biprob_list:
    total_prob = total_prob * prob