Code Example #1
File: predict_topics.py Project: dchecks/taggernews
def create_logistic_model(df, _story_ids, data):
    results = []
    lr_models = {}
    # `labels` is assumed to be a module-level sequence of topic labels in the original project
    for label in set(labels):
        positive_story_ids = set(df >> sift(X["labels"] == label) >> X.story_id.values)
        y_ = np.array([s in positive_story_ids for s in _story_ids])
        X_ = data
        lr = linear_model.LogisticRegression(C=C_VALUE)
        # logging.info expects a format string first; extra positional args need %s placeholders
        logging.info("%s %s", label, Counter(y_))

        # scikit-learn moved cross_val_score from cross_validation to model_selection (>= 0.18)
        cv_score = model_selection.cross_val_score(
            lr, X_, y_, cv=10, scoring="roc_auc").mean()

        lr = lr.fit(X_, y_)
        lr_models[label] = lr
        probs = lr.predict_proba(X_)[:, 1]
        results.append({"alg": "log reg", "label": label, "auc": cv_score})
        logging.info("C=%s label=%s auc=%s n>0.19=%s %s",
                     C_VALUE, label, cv_score, len(probs[probs > 0.19]), Counter(labels == label))
        logging.info("")  # blank separator; logging.info() without a message raises TypeError
    results_df = pd.DataFrame(results)

    lr_fname = make_time_filename(LOGISTIC_MODEL_NAME, ".pkl")
    logging.info("writing file", lr_fname)
    with open(lr_fname, "wb") as f:
        pickle.dump(lr_models, f, protocol=2)
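
The function above trains one binary LogisticRegression per label (a one-vs-rest setup) and pickles the resulting dict of models. A minimal sketch of how such a pickle might be loaded and applied later, assuming the same feature-matrix layout; the file name and the predict_labels helper below are hypothetical, not part of the project:

import pickle

import numpy as np

MODEL_FILE = "logistic_models.pkl"  # hypothetical name; the project derives it from make_time_filename()


def predict_labels(features, threshold=0.19):
    """Return, for each row of `features`, the labels whose model scores above `threshold`."""
    with open(MODEL_FILE, "rb") as f:
        lr_models = pickle.load(f)  # dict: label -> fitted LogisticRegression
    predictions = [[] for _ in range(features.shape[0])]
    for label, model in lr_models.items():
        probs = model.predict_proba(features)[:, 1]
        for row in np.flatnonzero(probs > threshold):
            predictions[row].append(label)
    return predictions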
Code Example #2
    def get_features(self, question, context):
        stop = set() if self.stop_words is None else self.stop_words.words
        context_features = np.zeros((len(context), 3))

        if not self.require_unique_match:
            question_words = set(x for x in question if x.lower() not in stop)
            question_words_lower = set(x.lower() for x in question)
            question_words_stem = set(
                self.lemmatize_word(x) for x in question_words_lower)
        else:
            question_words = set(k for k, v in Counter(question).items()
                                 if v == 1)
            question_words_lower = set(k for k, v in Counter(
                x.lower() for x in question_words).items() if v == 1)
            question_words_stem = set(k for k, v in Counter(
                self.lemmatize_word(x) for x in question_words_lower).items()
                                      if v == 1)

        for i, word in enumerate(context):
            if word in question_words:
                context_features[i][:3] = 1
            elif word.lower() in question_words_lower:
                context_features[i][:2] = 1
            elif self._lemmatizer.lemmatize(word) in question_words_stem:
                context_features[i][2] = 1

        if self.empty_question_features:
            return np.zeros((len(question), 3)), context_features
        else:
            return np.zeros((len(question), 0)), context_features
Code Example #3
File: main.py Project: Shivanisingh05/TwitterBot
def llt(query):
    print(
        colored(
            "*****************************************************************************************"
            "********************************************************************************************",
            color='magenta'))
    public_tweets = get_tweets(query)
    location = {}
    language = {}
    time_zone = {}
    for tweet in public_tweets['statuses']:
        loc = tweet['user']['location']
        lang = tweet['user']['lang']
        tz = tweet['user']['time_zone']

        if loc in location:
            location[loc] += 1
        else:
            location[loc] = 1
        if lang in language:
            language[lang] += 1
        else:
            language[lang] = 1
        if tz in time_zone:
            time_zone[tz] += 1
        else:
            time_zone[tz] = 1
    if None in time_zone:
        del time_zone[None]
    if '' in time_zone:
        del time_zone['']
    if '' in language:
        del language['']
    if '' in location:
        del location['']
    if None in location:
        del location[None]
    if None in language:
        del language[None]
    language_count = dict(Counter(language).most_common(4))
    print(colored("language: ", color='green', attrs=['bold']))
    print(language_count)
    location_count = dict(Counter(location).most_common(4))
    print(colored("locations: ", color='green', attrs=['bold']))
    print(location_count)
    time_zone_count = dict(Counter(time_zone).most_common(4))
    print(colored("Time Zone: ", color='green', attrs=['bold']))
    print(time_zone_count)
    print(
        colored(
            "*****************************************************************************************"
            "********************************************************************************************",
            color='magenta'))
Code Example #4
def llt(query):

    public_tweets = get_tweets(query)

    global time_zone1, loca, lang
    location = {}
    language = {}
    time_zone = {}
    for tweet in public_tweets['statuses']:
        loca = tweet['user']['location']
        lang = tweet['user']['lang']
        time_zone1 = tweet['user']['created_at']  # note: despite the name, this tallies the user's account creation timestamp
        if loca in location:
            location[loca] += 1
        else:
            location[loca] = 1
        if lang in language:
            language[lang] += 1
        else:
            language[lang] = 1
        if time_zone1 in time_zone:
            time_zone[time_zone1] += 1
        else:
            time_zone[time_zone1] = 1

    # limiting the display of the values
    if None in time_zone:
        del time_zone[None]
    if '' in time_zone:
        del time_zone['']
    if '' in language:
        del language['']
    if '' in location:
        del location['']
    if None in location:
        del location[None]
    if None in language:
        del language[None]

    language_count = dict(Counter(language).most_common(5))
    print(colored("Language: ", color='green', attrs=['bold']))
    print(language_count)

    location_count = dict(Counter(location).most_common(5))
    print(colored("Location: ", color='green', attrs=['bold']))
    print(location_count)
    time_zone_count = dict(Counter(time_zone).most_common(5))
    print(colored("Time Zone: ", color='green', attrs=['bold']))
    print(time_zone_count)
Code Example #5
def process_document(self, documents):
    tokenizer = data.load('tokenizers/punkt/english.pickle')  # note: punkt splits text into sentences, not words
    lemmatizer = WordNetLemmatizer()
    stopwords = corpus.stopwords.words('english')
    tf = []  # term frequency
    idf = {}  # keyed by token, so it must be a dict rather than a list
    tokens_list_doc_wise = []
    all_tokens = set()

    for document in documents:
        tokens = tokenizer.tokenize(document)
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        tokens = [token for token in tokens if token not in stopwords]
        tokens_list_doc_wise.append(tokens)
        tf.append(Counter(tokens))
        all_tokens.update(tokens)  # union() returns a new set; update() modifies all_tokens in place

    # calculating idf
    for token in all_tokens:
        present_in_documents = 0
        for x in range(len(documents)):
            if tf[x][token] > 0:
                present_in_documents += 1
        idf[token] = math.log(len(documents) / present_in_documents)

    # calculating tf_idf for tokens document wise (one dict per document)
    tf_idf = []
    for x in range(len(tokens_list_doc_wise)):
        tf_idf.append({})
        for token in tokens_list_doc_wise[x]:
            tf_idf[x][token] = tf[x][token] * idf[token]
    return tf_idf
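
As a sanity check on the bookkeeping above, here is a self-contained toy version of the same TF/IDF computation using only Counter and math; the corpus and all names are invented for illustration:

import math
from collections import Counter

docs = [["cat", "sat", "mat"], ["cat", "ate", "fish"]]

tf = [Counter(doc) for doc in docs]              # term frequency per document
all_tokens = set()
for doc in docs:
    all_tokens.update(doc)                       # update() grows the set in place

idf = {}
for token in all_tokens:
    df = sum(1 for counts in tf if counts[token] > 0)    # number of documents containing the token
    idf[token] = math.log(len(docs) / df)

tf_idf = [{token: counts[token] * idf[token] for token in counts} for counts in tf]
# "cat" appears in both documents, so idf["cat"] == log(2/2) == 0.0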
Code Example #6
    def train(self, trainset):
        X = []
        y = []
        self.tfidf = TfidfVectorizer()
        for sent in trainset:
            self.tfidfWordList.append(sent['target_word'])
            for item in sent['target_word'].split(" "):
                self.wordList.append(item)
        #tf model
        self.tfList = Counter(self.wordList)
        tf_values = np.array(list(self.tfList.values()))  # avoid shadowing the built-in max()
        self.maxNumber = np.max(tf_values)
        #tfidf model
        weightTfidf = self.tfidf.fit_transform(self.tfidfWordList).toarray()

        zeroVector = np.zeros(len(weightTfidf[0]))
        for item in weightTfidf:
            itemVector = np.array(item)
            zeroVector += itemVector
        # note: scikit-learn >= 1.2 replaces get_feature_names() with get_feature_names_out()
        self.tfidfResult = dict(zip(self.tfidf.get_feature_names(),
                                    zeroVector))
        # self.tfidfResult = {key:value for key,value in self.tfidf.vocabulary_.items()}
        # self.normal = np.max(np.array([item for item in self.tfidfResult.values()]))
        # print(self.tfList)
        for sent in trainset:
            X.append(self.extract_features(sent['target_word']))
            y.append(sent['gold_label'])
        self.model.fit(X, y)
        title = "TF+TFIDF " + self.language.capitalize()
        self.plot_learning_curve(self.model, title, X, y)
Code Example #7
def modified_precision(candidate, references, n):
    candidate_counter = Counter(get_ngrams(candidate, n))
    if not candidate_counter:
        return 0

    max_reference_counter = {}
    for reference in references:
        reference_counter = Counter(get_ngrams(reference, n))
        for ngram in candidate_counter:
            max_reference_counter[ngram] = max(
                max_reference_counter.get(ngram, 0), reference_counter[ngram])

    # clip the candidate's n-gram counts (not the last reference's) against the
    # maximum count observed in any single reference
    clipped_counter = dict(
        (ngram, min(candidate_count, max_reference_counter[ngram]))
        for ngram, candidate_count in candidate_counter.items())
    return sum(clipped_counter.values()) / sum(candidate_counter.values())
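
A quick usage sketch for the function above on the classic clipping example; the tiny get_ngrams stand-in is only for illustration, and the project's real helper may differ:

from collections import Counter

def get_ngrams(tokens, n):
    # minimal stand-in: contiguous n-grams as tuples
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

candidate = "the the the the the the the".split()
references = ["the cat is on the mat".split(),
              "there is a cat on the mat".split()]
print(modified_precision(candidate, references, 1))  # 2/7 ≈ 0.2857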
Code Example #8
def td_idf(text_array, neg_word_occ, pos_word_occ):
    for i in text_array:
        word_map = Counter(i)
        for wd in word_map:
            word_map[wd] = log(len(text_array) / (neg_word_occ[wd] + pos_word_occ[wd]), 10) * word_map[wd] / len(i)
        print(word_map)
        break  # debug: only the first document of text_array is processed and printed
Code Example #9
def countWords(self):
    filteredWords = cleanWords(self.contents)
    counts = Counter(filteredWords)
    wordFreq = dict()
    # Convert to relative frequency
    for c in counts:
        wordFreq[c] = counts[c] / len(filteredWords)
    return wordFreq
Code Example #10
def td_idf_to_vec(dataset, dim, neg_word_occ, pos_word_occ):
    result = zeros((len(dataset), dim))
    for i, val in enumerate(dataset):
        word_map = Counter(val)
        for wd in val:
            print(word_map)  # debug output, printed once per token
            # assumes each dataset entry is a sequence of integer word indices (< dim),
            # so wd can index both the occurrence tables and the output row
            result[i][wd] = log(len(dataset) / (neg_word_occ[wd] + pos_word_occ[wd]), 10) * word_map[wd] / len(val)
    return result
Code Example #11
def ngrams_over_file(sourcefile, n, targetfile):
    f = open(targetfile, 'w')
    data_set = read_file(sourcefile)
    split_it = data_set.split()
    split_it = list(filter(lambda x: x not in stopwords, split_it))
    split_it = list(filter(lambda x: len(x) > 2, split_it))
    counter = Counter(split_it)
    if n > 1:
        grams = ngrams(split_it, n)
        counter = Counter(grams)
    most_occur = counter.most_common(20)
    for occur in most_occur:
        if n == 1:
            f.write(occur[0] + "|" + str(occur[1]) + "\n")
        else:
            f.write(' '.join(map(str, occur[0])) + "|" + str(occur[1]) + "\n")
    print(most_occur)
    f.close()
Code Example #12
File: Processing.py Project: joshuacc1/rss_reader
    def getsimiliar(self, word, text):
        T = Text(text)
        word_context_index = ContextIndex(T.tokens,
                                          filter=lambda x: x.isalpha(),
                                          key=lambda s: s.lower())
        word = word.lower()
        wci = word_context_index._word_to_contexts
        words = []
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = Counter(w for w in wci.conditions() for c in wci[w]
                         if c in contexts and not w == word)
            words = [w for w, _ in fd.most_common(20)]
        return words
Code Example #13
File: parser.py Project: LucaPrg/TLN
def cleaning(sentence: str,
             method: str,
             frequency: int = 0,
             percentage: int = 0):
    """
    :param sentence: Definition to clean
    :param method: string which define which method to call
    :param frequency: if not None define minimum number of words repetition
    :param percentage: percentage of the highest frequent words to take
    :return Counter(key=word,value=frequency): sentence cleaned
    """
    tokenized: Counter = rm_stopwords_punctuation(sentence)
    tokenized = utility.remove_number_key(tokenized,
                                          minimum=1950,
                                          maximum=2030)
    if len(tokenized) <= 0:
        return Counter()
    elif frequency > 0:
        # Filtering only words with at least frequency occurrences
        filtered = dict(filter(lambda x: x[1] >= frequency, tokenized.items()))
        i = 1
        while len(filtered) <= 0:
            filtered = dict(
                filter(lambda x: x[1] >= frequency - i, tokenized.items()))
            i += 1
        return globals()[method](Counter(filtered))
    # If a percentage is defined take the first elements (based on percentage), otherwise take everything
    elif percentage > 0:
        percentage = int((percentage / 100) * len(tokenized))
        most_common = tokenized.most_common(percentage)
        tokenized = Counter(
            dict(
                filter(lambda elem: elem[0] in dict(most_common).keys(),
                       tokenized.items())))

    return globals()[method](tokenized)
Code Example #14
File: parser.py Project: LucaPrg/TLN
def rm_stopwords_punctuation(sentence: str,
                             language="english",
                             stamp=False) -> Counter:
    tokens = word_tokenize(sentence)
    if len(tokens) > 0:
        tokens[0] = tokens[0].lower()
    sentence = Counter(tokens)
    stopwords_list = set(stopwords.words(language))
    stop_punctuation = stopwords_list.union(resources.punctuation).union(
        resources.ambiguous)
    filtered = utility.filter_by_set(sentence, stop_punctuation)
    if stamp:
        print("---Removing Stopwords---")
        print("Stopwords in", language, ":", stopwords_list)
        print("Sentence with stopwords and punctuation removed:\n", filtered)
    return filtered
Code Example #15
File: main.py Project: Shivanisingh05/TwitterBot
def Topusage():
    new_tweets = api.user_timeline(screen_name='@narendramodi',
                                   count=200,
                                   tweet_mode='extended')
    for tweet in new_tweets:
        #print(tweet.full_text)
        temp = []
        temp.append(tweet.full_text)
        temp1 = temp
        import re
        words = re.sub(r"http\S+", " ", str(temp1))
        word = words.split()
        # keep only the words that are NOT stopwords; the original comparison was inverted,
        # so the follow-up loop could never append anything
        word1 = [w for w in word if w not in stop_words]
        num = Counter(word1).most_common(10)
        print(num)
Code Example #16
File: nlp_udemy.Py Project: fubu03/NLP_scripts
namedEnt.draw()


#6 Bag of Words Model-Most common words datacamp 

from nltk import Counter
from nltk.corpus import stopwords
# so far we already have text, lets do a quick bow model:


alpha_tokens_lower= [w.lower() for w in nltk.word_tokenize(text) if w.isalpha()]
no_stops=[w for w in alpha_tokens_lower if w not in stopwords.words('english') ]


len(alpha_tokens_lower) - len(no_stops)  # number of stopword tokens removed (value is only shown in an interactive session)
words_count=Counter(no_stops)

words_count.most_common(17)

#7 Bag of words model Udemy:

#we already have the text:
import re
dataset=nltk.sent_tokenize(text)


# lowercase, replace non-word characters with a single space, collapse whitespace
for i in range(len(dataset)):
    dataset[i]=dataset[i].lower()
    dataset[i]=re.sub(r'\W', ' ', dataset[i])
    dataset[i]=re.sub(r'\s+',' ',dataset[i])  
Code Example #17
    def count_words(cls, items: list):
        counter = Counter(items)
        return counter.most_common(5)
Code Example #18
File: text.py Project: LBJ-Wade/NLOOP
    def _token_counter(self):
        """Return the counts of all tokens"""
        return Counter([word for doc in self.tokens for word in doc])
Code Example #19
File: parser.py Project: LucaPrg/TLN
def lemmer(tokens) -> Counter:
    lemmed = Counter()
    for k in tokens.keys():
        lemmed.update({lemmatizer.lemmatize(k): tokens[k]})
    return lemmed
Code Example #20
                    semantic_values = stringate_value(hypernom_subj, hypernom_dobj)
                    semantic_type.append(semantic_values)

    return semantic_type, sentences_analyzed


if __name__ == '__main__':
    verbs_bf = ['build', 'love', 'eat']

    for verb_base_form in verbs_bf:

        sentences = get_sentences_with_verb(verb_base_form)
        print('*' * 50)
        print('\nCurrent verb base form : {}\n'.format(verb_base_form))
        semantic_cluster, sentences_analyzed = get_semantic_cluster(sentences, verb_base_form)
        print('------ End extraction-----------')

        # Print stats
        sts_semantic_cluster = Counter(semantic_cluster)
        common_semantic_cluster = sts_semantic_cluster.most_common(5)
        plot_result(common_semantic_cluster, verb_base_form)

        print('\nAnalyzed {} sentences \nFor the verb in base form : {} the pairs of semantic types are:\n'
              .format(sentences_analyzed, verb_base_form))

        for s in sts_semantic_cluster:
            print('\t< {} > Count {} '.format(s, sts_semantic_cluster[s]))

        print('*' * 50)
        print('\n\n\n')
Code Example #21
    def statistics(self, trainset):
        for sent in trainset:
            # raw strings keep "\w" from being treated as a (deprecated) string escape
            self.wordNumber += len(
                re.sub(r"[^\w']", " ", sent['sentence']).split())
            self.wordbackup += re.sub(r"[^\w']", " ", sent['sentence']).split()
        self.wordCounter = Counter(self.wordbackup)
Code Example #22
File: bleu.py Project: ningmengwei-ata/Class-Project
def _modified_precision(candidate, references, n):
    """Calculate modified ngram precision.

    The normal precision method may lead to some wrong translations with
    high precision, e.g., a translation in which a reference word is
    repeated many times can score very high precision. So in the modified
    n-gram precision, a reference word is considered exhausted once a
    matching candidate word has been identified.

    Paper examples:

    >>> _modified_precision(
    ...    'the the the the the the the'.split(),
    ...    ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()],
    ...    n=1,
    ... )
    0.28...

    >>> _modified_precision(
    ...    'the the the the the the the'.split(),
    ...    ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()],
    ...    n=2,
    ... )
    0.0

    >>> _modified_precision(
    ...    'of the'.split(),
    ...    [
    ...        'It is a guide to action that ensures that the military will forever heed Party commands.'.split(),
    ...        'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(),
    ...        'It is the practical guide for the army always to heed the directions of the party'.split(),
    ...    ],
    ...    n=1,
    ... )
    1.0

    >>> _modified_precision(
    ...    'of the'.split(),
    ...    [
    ...        'It is a guide to action that ensures that the military will forever heed Party commands.'.split(),
    ...        'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(),
    ...        'It is the practical guide for the army always to heed the directions of the party'.split(),
    ...    ],
    ...    n=2,
    ... )
    1.0

    More examples:

    >>> weights = [0.25, 0.25, 0.25, 0.25]
    >>> candidate1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...               'ensures', 'that', 'the', 'military', 'always',
    ...               'obeys', 'the', 'commands', 'of', 'the', 'party']

    >>> candidate2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
    ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
    ...               'that', 'party', 'direct']

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...               'heed', 'Party', 'commands']

    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']

    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']

    Unigrams:

    >>> _modified_precision(
    ...    candidate1,
    ...    [reference1, reference2, reference3],
    ...    n=1,
    ... )
    0.94...

    >>> _modified_precision(
    ...    candidate2,
    ...    [reference1, reference2, reference3],
    ...    n=1,
    ... )
    0.57...

    Bigrams:

    >>> _modified_precision(
    ...    candidate1,
    ...    [reference1, reference2, reference3],
    ...    n=2,
    ... )
    0.58...

    >>> _modified_precision(
    ...    candidate2,
    ...    [reference1, reference2, reference3],
    ...    n=2,
    ... )
    0.07...

    """
    counts = Counter(ngrams(candidate, n))

    if not counts:
        return 0

    max_counts = {}
    for reference in references:
        reference_counts = Counter(ngrams(reference, n))
        for ngram in counts:
            max_counts[ngram] = max(max_counts.get(ngram, 0),
                                    reference_counts[ngram])

    clipped_counts = dict((ngram, min(count, max_counts[ngram]))
                          for ngram, count in counts.items())

    return sum(clipped_counts.values()) / sum(counts.values())
Code Example #23
import pandas
import vincent
from nltk import Counter

from DB import db
from Preprocessing import preprocess, stop

count = 0
db = db()

allTweets = db.getAll()

count_all_hashtags = Counter()
count_all_terms = Counter()
dates_hashtag = []
for tweet in allTweets:
    tweetText = tweet['text'].lower()
    # Bigrams list
    termsWithoutStopwords = [
        term for term in preprocess(tweetText) if term not in stop
    ]
    # termsBigrams = bigrams(termsWithoutStopwords)

    # Hashtags list
    terms_hash = [
        term for term in preprocess(tweetText) if term.startswith('#')
    ]
    if '#marchfortruth' in terms_hash:
        dates_hashtag.append(tweet['created_at'])

    # Update the counter(s)
Code Example #24
File: parser.py Project: LucaPrg/TLN
def stemmer(tokens) -> Counter:
    stemmed = Counter()
    for k in tokens.keys():
        stemmed.update({stemmatizer.stem(k): tokens[k]})
    return stemmed
Code Example #25
def get_features(text, setting):
    if setting == 'bow':
        return {word: count for word, count in Counter(preprocess(text)).items() if word not in stoplist}
    else:
        return {word: True for word in preprocess(text) if word not in stoplist}
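
A brief usage sketch; the preprocess stand-in and the stoplist below are invented for illustration and will differ from the project's own versions:

from collections import Counter

def preprocess(text):
    return text.lower().split()   # stand-in tokenizer

stoplist = {"the", "a", "of", "on"}

print(get_features("The cat sat on the mat", "bow"))
# {'cat': 1, 'sat': 1, 'mat': 1}
print(get_features("The cat sat on the mat", "boolean"))
# {'cat': True, 'sat': True, 'mat': True}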
Code Example #26
# count the words in data_split (built earlier in the script): if a word is already in
# the wordcount dictionary, increment its count; otherwise add it with a count of 1
for item in data_split:
    if item in wordcount:
        wordcount[item] += 1
    else:
        wordcount[item] = 1

qstring = "I think I will get the best score in the class"
qstring_split = qstring.split()
qstring_dict = {}
for word in qstring_split:
    if word in qstring_dict.keys():
        qstring_dict[word] += 1
    else:
        qstring_dict[word] = 1
#count bigrams in the text file
from nltk import Counter
data_bi = Counter(nltk.bigrams(data_split))
q_bi = Counter(nltk.bigrams(qstring_split))
#count the probability of each word in wordcount dictionary
biprob_list = []
for item in q_bi:
    if item in data_bi:
        biprob_list.append(q_bi[item] / data_bi[item])
    else:
        # an unseen bigram should contribute probability 0 (the original assigned
        # a dead variable bi_prob here and never used it)
        biprob_list.append(0)
total_prob = 1
for prob in biprob_list:
    total_prob = total_prob * prob