Python LexicalRichness Beispiele, lexicalrichness.LexicalRichness Python Beispiele

Beispiel #1

0

Datei anzeigen

    def _compute_lexicalRichness(text, lemmatized_test, word_count,
                                 unique_words_cnt, lexical_diversity):
        """Append one document's lexical statistics to the running lists.

        The word count and unique-term count are taken from ``text``; the
        diversity figure is the ratio of terms to words of
        ``lemmatized_test``.  The three lists are extended in place and
        returned as ``(word_count, unique_words_cnt, lexical_diversity)``.
        """
        raw_stats = LexicalRichness(text)
        lemma_stats = LexicalRichness(lemmatized_test)

        # Word count and unique-term count of the unlemmatized text.
        word_count.append(raw_stats.words)
        unique_words_cnt.append(raw_stats.terms)

        # Terms-to-words ratio of the lemmatized text.
        lemma_terms = float(lemma_stats.terms)
        lemma_words = float(lemma_stats.words)
        lexical_diversity.append(lemma_terms / lemma_words)

        return word_count, unique_words_cnt, lexical_diversity

Beispiel #2

0

Datei anzeigen

Datei: buildFeatures.py Projekt: Khamar-Uz-Zama/ATiML-Semester-Assignment

def extractLexicalRichness():
    """
    Build a LexicalRichness object for every indexed book and pickle them.

    The text of each book is read from its '-content.html' file via
    ``pp.readHTMLFile``.  Books for which no text comes back are skipped and
    their row number is kept in a local list.  The collected objects are
    written to ``lexRichFile``.
    """
    corpusRoot = 'Gutenberg_English_Fiction_1k'
    htmlDirName = 'Gutenberg_19th_century_English_Fiction'
    # The corpus root directory name occurs twice in the path.
    htmlDir = os.path.join(os.getcwd(), corpusRoot, corpusRoot, htmlDirName)

    index = pp.readIndexes()
    richnessObjects = []
    skippedRows = []

    for row in range(len(index)):
        print(row)
        # The last five characters of the joined path are replaced by
        # '-content.html' (presumably a '.html'-style suffix -- TODO confirm).
        bookPath = os.path.join(htmlDir, index['book_id'][row])
        contentPath = bookPath[:-5] + '-content.html'
        text = pp.readHTMLFile(contentPath)
        if not text:
            skippedRows.append(row)
            continue

        richness = LexicalRichness(text)
        # Drop the raw text and the token list before the object is stored
        # (presumably to keep the pickle small).
        del richness.wordlist
        del richness.text
        richnessObjects.append(richness)

    with open(lexRichFile, 'wb') as f:
        pickle.dump(richnessObjects, f)

Beispiel #3

0

Datei anzeigen

Datei: features.py Projekt: abigailmunsen/review-helpfulness-prediction

def type_token_ratio(row):
    """Return the type-token ratio of ``row['reviewText']``.

    Returns 0 when the ratio cannot be computed because the text contains
    no words (the ratio divides by the word count).
    """
    lex = LexicalRichness(row['reviewText'])
    try:
        return lex.ttr
    except ZeroDivisionError:
        # Only the empty-text case is handled; a bare ``except`` would also
        # swallow KeyboardInterrupt and genuine programming errors.
        return 0

Beispiel #4

0

Datei anzeigen

Datei: lexical_variation.py Projekt: AldiT/essayevaluation

    def __call__(self, doc):
        """Compute the lexical variation indices of ``doc`` and attach them.

        For a tagged document only the tokens marked as lexical are scored;
        for an untagged one a warning is issued and the whole text is used.
        Each index is stored under its ``LV_*`` key in ``doc._.features``
        and, for backwards compatibility, in the ``doc._.features_lv`` list.
        Returns the same ``doc``.
        """
        if doc.is_tagged:
            # remove all non lexical words
            self.mark_lexical_token(doc)
            lexical_tokens = []
            for token in doc:
                if token._.is_lexical:
                    lexical_tokens.append(str(token))
            lex = LexicalRichness(' '.join(lexical_tokens))
        else:
            # the tagger component is not part of the pipeline
            warnings.warn(
                'The spaCy document was not tagged. Please add the TAGGER component or else the '
                'lexical variation indices will be based on all tokens and not just lexical ones!',
                Warning)
            lex = LexicalRichness(doc.text)

        # Feature key -> extractor, in the order the indices are reported.
        extractors = (
            ('LV_W', self.get_lv_w),
            ('LV_WT', self.get_lv_wt),
            ('LV_WT1', self.get_lv_wt1),
            ('LV_TTR', self.get_lv_ttr),
            ('LV_CTTR', self.get_lv_cttr),
            ('LV_RTTR', self.get_lv_rttr),
            ('LV_HDD', self.get_lv_hdd),
            ('LV_DUGA', self.get_lv_duga),
            ('LV_MAAS', self.get_lv_maas),
            ('LV_SUMM', self.get_lv_summ),
            ('LV_YULEK', self.get_lv_yulek),
            ('LV_MTLD', self.get_lv_mtld),
            ('LV_MSTTR', self.get_lv_msttr),
            ('LV_MATTR', self.get_lv_mattr),
        )
        for key, extractor in extractors:
            doc._.features[key] = extractor(lex)

        # deprecated! do not use doc._.features_lv anymore, use doc._.features instead!
        doc._.features_lv = [doc._.features[key] for key, _ in extractors]

        return doc

Beispiel #5

0

Datei anzeigen

Datei: text_scores.py Projekt: GrzegorzMika/MachineLearning

def lexical_richness(data):
    """Add a ``lexical_richness`` column with the ``ttr`` of every text.

    ``data['text']`` is iterated once; the resulting values are stored in
    ``data['lexical_richness']`` and ``data`` itself is returned.
    """
    logging.info(f"Processing score lexical richness")
    data['lexical_richness'] = [
        LexicalRichness(doc).ttr for doc in data['text']
    ]
    return data

Beispiel #6

0

Datei anzeigen

Datei: features.py Projekt: dzenilee/presidential

 def get_mtld(doc):
     """Return the MTLD score of ``doc.text`` computed with threshold 0.72.

     MTLD is the measure of textual lexical diversity (McCarthy 2005,
     McCarthy and Jarvis 2010).  Text without a single ASCII letter,
     e.g. "2" or "223).", scores 0.
     """
     segment = doc.text
     has_letter = re.search("[a-zA-Z]", segment) is not None
     if not has_letter:
         return 0
     return LexicalRichness(segment).mtld(threshold=0.72)

Beispiel #7

0

Datei anzeigen

def get_lexical_richness(sentence: str):
    """Return the type-token ratio of ``sentence``, or 0 if it has no words.

    The previous ``return`` inside ``finally`` discarded every exception
    raised while computing the ratio, not only the intended
    ``ZeroDivisionError``; any other error now propagates to the caller.
    """
    lex = LexicalRichness(sentence)
    try:
        return lex.ttr
    except ZeroDivisionError:
        # Contains no effective words, return 0 instead
        return 0

Beispiel #8

0

Datei anzeigen

Datei: LexicalRichnessCalculator.py Projekt: LiatNativPersonal/Cognates

 def calculate_lexical_richness_measure(self, text, window_size = 200, threshold = 0.72):
     """Store the MATTR and MTLD of ``text`` in ``self.measures`` and return it.

     ``window_size`` is forwarded to ``mattr`` and ``threshold`` to ``mtld``.
     """
     richness = LexicalRichness(text)

     # moving average
     moving_average = richness.mattr(window_size=window_size)
     self.measures['mattr'] = moving_average

     # measure of lexical diversity
     diversity = richness.mtld(threshold=threshold)
     self.measures['mtld'] = diversity

     return self.measures

Beispiel #9

0

Datei anzeigen

Datei: nlp_util.py Projekt: vjbytes102/open_dbm

def process_speech(transcribe_df, r_config):
    """
        Preparing speech features
        Args:
            transcribe_df: Transcribed dataframe; only its first row is read
            r_config: raw config file object providing the column names
        Returns:
            Single-row dataframe of speech features, or the result of
            empty_speech() when the transcription status is not 'Pass'
    """

    err_transcribe = transcribe_df[r_config.err_reason].iloc[0]
    transcribe = transcribe_df[r_config.nlp_transcribe].iloc[0]
    total_time = transcribe_df[r_config.nlp_totalTime].iloc[0]
    master_url = transcribe_df['dbm_master_url'].iloc[0]

    # Check the status before touching the transcript: a failed transcription
    # may not carry a usable string, and cleaning it first would raise.
    if err_transcribe != 'Pass':
        # NOTE(review): error_txt is not defined inside this function; it must
        # exist at module level or this line raises NameError -- confirm
        # whether err_transcribe was intended.
        df_speech = empty_speech(r_config, master_url, error_txt)

        return df_speech

    #clean transcribe
    transcribe = transcribe.replace(",", "")
    transcribe = " ".join(re.findall(r"[\w']+|[.!?]", transcribe))

    speech_dict = {}
    nltk_download()

    sentences = nltk.tokenize.sent_tokenize(transcribe)
    words_all = nltk.tokenize.word_tokenize(transcribe)
    num_sentences = len(sentences)
    # Per-answer counts are reported as NaN when the transcript has no tokens.
    has_words = len(words_all) > 0

    speech_dict[r_config.nlp_numSentences] = num_sentences

    #nlp_singPron
    # NOTE(review): str.count counts substrings, so 'me' is also counted
    # inside words such as 'some' or 'time' -- confirm this is intended.
    i_s = transcribe.count('I')
    me_s = transcribe.count('me')
    my_s = transcribe.count('my')
    sing_count = i_s + me_s + my_s

    speech_dict[r_config.nlp_singPronPerAns] = (
        sing_count if has_words else np.nan)
    speech_dict[r_config.nlp_singPronPerSen] = divide_var(
        speech_dict[r_config.nlp_singPronPerAns], num_sentences)

    tagged = nltk.pos_tag(transcribe.split())
    tagged_df = pd.DataFrame(tagged, columns=['word', 'pos_tag'])
    all_POSs = tagged_df['pos_tag'].tolist()

    #Past tense per answer
    speech_dict[r_config.nlp_pastTensePerAns] = (
        all_POSs.count('VBD') if has_words else np.nan)
    speech_dict[r_config.nlp_pastTensePerSen] = divide_var(
        speech_dict[r_config.nlp_pastTensePerAns], num_sentences)

    #Pronoun per answer
    pronounsPerAns = all_POSs.count('PRP') + all_POSs.count('PRP$')
    speech_dict[r_config.nlp_pronounsPerAns] = (
        pronounsPerAns if has_words else np.nan)
    speech_dict[r_config.nlp_pronounsPerSen] = divide_var(
        speech_dict[r_config.nlp_pronounsPerAns], num_sentences)

    #Verb per answer
    verbPerAns = sum(
        all_POSs.count(tag)
        for tag in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'))
    speech_dict[r_config.nlp_verbsPerAns] = (
        verbPerAns if has_words else np.nan)
    speech_dict[r_config.nlp_verbsPerSen] = divide_var(
        speech_dict[r_config.nlp_verbsPerAns], num_sentences)

    #Adjective per answer
    adjectivesAns = sum(all_POSs.count(tag) for tag in ('JJ', 'JJR', 'JJS'))
    speech_dict[r_config.nlp_adjectivesPerAns] = (
        adjectivesAns if has_words else np.nan)
    speech_dict[r_config.nlp_adjectivesPerSen] = divide_var(
        speech_dict[r_config.nlp_adjectivesPerAns], num_sentences)

    #Noun per answer
    nounsAns = sum(all_POSs.count(tag) for tag in ('NN', 'NNP', 'NNS'))
    speech_dict[r_config.nlp_nounsPerAns] = (
        nounsAns if has_words else np.nan)
    speech_dict[r_config.nlp_nounsPerSen] = divide_var(
        speech_dict[r_config.nlp_nounsPerAns], num_sentences)

    #Sentiment analysis
    vader = SentimentIntensityAnalyzer()
    sentence_valences = [
        vader.polarity_scores(s)['compound'] for s in sentences
    ]
    speech_dict[r_config.nlp_sentiment_mean] = np.mean(
        sentence_valences) if len(sentence_valences) > 0 else np.nan

    non_punc = [value for value in words_all
                if value not in ['.', '!', '?']]

    # Join the tokens themselves.  Joining str(non_punc) iterated over the
    # characters of the list's repr, so the score was computed on single
    # characters, brackets and quotes instead of on words.
    non_punc_as_str = " ".join(non_punc)
    lex = LexicalRichness(non_punc_as_str)
    speech_dict[r_config.nlp_mattr] = lex.mattr(
        window_size=lex.words) if lex.words > 0 else np.nan

    #Number of words per minute
    speech_dict[r_config.nlp_wordsPerMin] = divide_var(len(non_punc),
                                                       total_time) * 60
    speech_dict[r_config.nlp_totalTime] = total_time
    speech_dict['dbm_master_url'] = master_url

    df_speech = pd.DataFrame([speech_dict])
    return df_speech

Beispiel #10

0

Datei anzeigen

Datei: text_processor.py Projekt: douglascastrorj/fakenews_identification

    def process_dataset(self,
                        dataset,
                        remove_stop_words=False,
                        stem=False,
                        remove_punct=False,
                        n_gram=1,
                        tags=False,
                        pos=False,
                        dep=False,
                        alpha=False,
                        ent=False,
                        sentiment=False,
                        vectorizer='count',
                        lex=False,
                        normalize=False,
                        tag_ngram=False,
                        text_features=False):
        """Turn ``dataset`` into a feature matrix.

        Every text is run through ``self.proccess_text`` (which receives the
        ``remove_stop_words`` ... ``tag_ngram`` options unchanged) and the
        result is vectorized.

        Args:
            dataset: iterable of texts; it is iterated more than once.
            vectorizer: ``'tfidf'`` selects ``TfidfVectorizer``, any other
                value ``CountVectorizer``.  With ``None`` the processed
                corpus is returned without vectorizing.
            lex: append ttr, rttr, cttr and mtld columns for each text.
            normalize: normalize the vectorized matrix and the lexical
                richness columns.
            text_features: append character/sentence statistics.  Only
                honoured when ``lex`` is also true, as in the original
                nesting.  NOTE(review): confirm that dependency is intended.

        Returns:
            The processed corpus (list) when ``vectorizer`` is ``None``,
            otherwise a dense 2-D array.
        """
        if vectorizer == 'tfidf':
            self.vectorizer = TfidfVectorizer()
        else:
            self.vectorizer = CountVectorizer()

        processed_corpus = [
            self.proccess_text(text,
                               remove_stop_words=remove_stop_words,
                               stem=stem,
                               remove_punct=remove_punct,
                               n_gram=n_gram,
                               tags=tags,
                               pos=pos,
                               dep=dep,
                               alpha=alpha,
                               ent=ent,
                               sentiment=sentiment,
                               tag_ngram=tag_ngram) for text in dataset
        ]
        if vectorizer is None:
            return processed_corpus

        X = self.vectorizer.fit_transform(processed_corpus)
        X = X.toarray()

        if normalize:
            X = preprocessing.normalize(X)

        if lex:

            def safe_metric(compute):
                # A metric that cannot be computed (e.g. a text without
                # words) counts as 0.0.  ``Exception`` keeps the original
                # best-effort behaviour without swallowing
                # KeyboardInterrupt/SystemExit as a bare ``except`` did.
                try:
                    return compute()
                except Exception:
                    return 0.0

            lex_features = []
            for text in dataset:
                # Named ``richness`` so the ``lex`` flag is not shadowed.
                richness = LexicalRichness(text)
                lex_features.append([
                    safe_metric(lambda: richness.ttr),
                    safe_metric(lambda: richness.rttr),
                    safe_metric(lambda: richness.cttr),
                    safe_metric(lambda: richness.mtld(threshold=0.72)),
                ])
            lex_features = np.array(lex_features)
            if normalize:
                lex_features = preprocessing.normalize(lex_features)

            X = np.concatenate((X, lex_features), axis=1)

            # Concatenate the text statistics only when they were requested:
            # an empty list is 1-D and cannot be joined to the 2-D matrix,
            # which made lex=True with text_features=False raise ValueError.
            if text_features:
                _text_features = [[
                    self.countChar(text),
                    self.countCharWithoutSpace(text),
                    self.countSentences(text),
                    self.avarageSentenceLength(text),
                    self.maxSentenceLength(text),
                    self.minSentenceLength(text),
                ] for text in dataset]
                X = np.concatenate((X, _text_features), axis=1)

        return X

Beispiel #11

0

Datei anzeigen

Datei: corpus_stats_viz.py Projekt: GillesJ/sentivent_webannoparser

def compute_lexical_richness(events,
                             by=["event_type"],
                             extent=["discontiguous_triggers"],
                             preproc=None):
    """
    Compute lexical richness measures of unit attributes.
    event_type extracts the mention tokens

    :param events: list of Event objects
    :param by: Group metric by attribute name. Used for grouping by event_type or subtype
        (the list default is only read, never mutated)
    :param extent: Extent of the text getter functions on Event. Default: full event trigger with discont.
    :param preproc: list of preprocessing functions that take a string of text as input.
    :return: DataFrame indexed by ``annotation_type`` holding the metrics and
        their rank columns, sorted by ``rank_maas_hdd_mtld``
    """
    from lexicalrichness import LexicalRichness

    print(
        f"Computing lexical richness of {str(extent).upper()} grouped by {str(by).upper()} with preprocessing: {str(preproc)}"
    )
    # collect text by attribute
    # The group label is built from the attribute VALUES joined with ".".
    # The former groupby key was a generator object: generators compare by
    # identity, so every event formed its own group and the label was the
    # generator's repr with a dot between each character.  Accumulating in a
    # dict also removes groupby's requirement that the input be sorted.
    all_text = {}
    for event in events:
        attrib_name = ".".join(
            str(getattr(event, attrib_n)) for attrib_n in by)

        text = event.get_extent_text(extent=extent)
        if preproc:
            for preproc_func in preproc:
                text = preproc_func(text)
        all_text.setdefault(attrib_name, []).append(text)

    # compute lexical diversity by attribute
    d = []
    for attrib_name, text in all_text.items():
        # A mean per-mention TTR was tried and dropped: mention TTR is nearly
        # always 1.

        # Lexical entropy over whole mentions
        p, lns = Counter(text), float(len(text))
        entropy = -sum(count / lns * math.log(count / lns, 2)
                       for count in p.values())

        # metrics on all mentions together
        text = " ".join(text)
        lr = LexicalRichness(text)

        d.append({
            "annotation_type": attrib_name,
            "cttr": lr.cttr,
            "entropy": entropy,
            "dugast": lr.Dugast,
            "type_count": lr.terms,
            "token_count": lr.words,
            "herdan": lr.Herdan,
            "somers": lr.Summer,
            "maas": lr.Maas,  # low sensitivity
            "ttr": lr.ttr,
            "rttr": lr.rttr,
            "mtld": lr.mtld(threshold=0.72),  # length correct, mid sensitivity
            "msttr":
            lr.msttr(segment_window=25),  # length correct, mid sensitivity
            "mattr":
            lr.mattr(window_size=25),  # length correct, mid sensitivity
            "hdd": lr.hdd(draws=42),  # length correct, high sensitivity
        })

    df_lr = pd.DataFrame(d)
    # invert Maas for plotting
    df_lr["maas_inv"] = df_lr["maas"] * -1.0

    rec_metrics = ["maas", "hdd",
                   "mtld"]  # recommended metrics in McCarthy 2010
    # rank
    df_lr = util.rank_dataframe_column(
        df_lr, ascending=False)  # add rank column for easy comparison
    df_lr["maas_rank"] = (df_lr["maas"].rank().astype(int)
                          )  # Maas is inverted, lower score is more richness
    df_lr = df_lr.drop(labels=["annotation_type_rank"],
                       axis=1)  # no need for index column ranking

    # nicer output
    df_lr = df_lr.sort_index(axis=1)  # sort columns alphabetically
    rank_cols = [c for c in df_lr if "_rank" in c and "_count" not in c]
    df_lr["rank_all"] = (df_lr[rank_cols].sum(axis=1).rank().astype(int)
                         )  # sum every metric rank and rank inversely
    df_lr["rank_maas_hdd_mtld"] = (df_lr[[m + "_rank" for m in rec_metrics
                                          ]].sum(axis=1).rank().astype(int)
                                   )  # combine recommended metrics
    df_lr = df_lr.set_index("annotation_type")
    df_lr = df_lr.sort_values(
        by="rank_maas_hdd_mtld"
    )  # sort values by combination of recommended metrics in McCarthy 2010
    return df_lr

Beispiel #12

0

Datei anzeigen

def preprocess(df_total):
    """Preprocess article text: average sentence length, frequency of tags,
    POS tagging, lexical measures and a sentiment score.

    Columns created before the length filter are added to the passed frame
    in place; the returned frame is a filtered copy (texts of at least 100
    characters) that additionally carries the lexical and sentiment columns.
    """
    # Cleaning text
    df_total["text"] = df_total.text.apply(lambda x: x.lower())
    df_total["text"] = df_total.text.apply(lambda x: re.sub(r'\d+', 'num', x))

    # substituting "U.S."
    # The dots are escaped so they match literal periods, and the match is
    # case-insensitive because the text was lower-cased above; the former
    # pattern "U.S." needed capital letters and therefore never matched.
    df_total["little_clean"] = df_total.text.apply(
        lambda x: re.sub(r"U\.S\.", "United States", x, flags=re.IGNORECASE))

    # cleaning text
    # NOTE(review): the translation table is empty, so cleaned_text equals
    # text -- confirm whether punctuation was meant to be stripped here.
    table_ = str.maketrans('', '')
    df_total['cleaned_text'] = df_total.text.str.translate(table_)

    # *******SYNTACTIC FEATURES *******#

    # splitting articles into sentences (raw string: "\." is not a valid
    # escape in a normal string literal)
    df_total["sentences"] = df_total.little_clean.str.split(r"\. ")

    # calculating num of sentences in each article
    df_total["num_of_sentences"] = df_total.sentences.apply(lambda x: len(x))

    # average length of sentences (in characters)
    df_total["avg_sentence_length"] = df_total.sentences.apply(
        lambda x: round(np.mean([len(item) for item in x])))

    # POS Tagging
    df_total['POS_tags'] = df_total.cleaned_text.apply(
        lambda x: nltk.pos_tag(nltk.word_tokenize(x), tagset='universal'))

    # frequency of tags
    df_total["tag_fq"] = df_total.POS_tags.apply(
        lambda x: nltk.FreqDist(tag for (word, tag) in x))

    # count of each tag in each article: output column -> universal POS tag
    tag_columns = (
        ('Noun', 'NOUN'),
        ('Verb', 'VERB'),
        ('Punctuation', '.'),
        ('Adposition', 'ADP'),
        ('Determiner', 'DET'),
        ('Adjective', 'ADJ'),
        ('Particle', 'PRT'),
        ('Adverb', 'ADV'),
        ('Pronoun', 'PRON'),
        ('Conjunction', 'CONJ'),
        ('Numeral', 'NUM'),
        ('Other', 'X'),
    )
    for column, tag in tag_columns:
        # ``tag=tag`` binds the current tag; a plain closure would see only
        # the last value of the loop variable.
        df_total[column] = df_total.tag_fq.apply(
            lambda x, tag=tag: pos(x.most_common())[tag])

    # *********LEXICAL FEATURES **********#

    # character count
    df_total['characters_count'] = df_total.text.str.len()

    # Filtering only large texts; copy so the assignments below write to an
    # independent frame instead of a slice of the input.
    df_total = df_total.loc[df_total.characters_count >= 100].copy()

    # word average
    df_total['word_average'] = df_total['text'].apply(
        lambda x: np.mean([len(w) for w in x.split(' ')]))

    # lexical diversity
    df_total['lexical_diversity'] = df_total.text.apply(
        lambda x: ld.ttr([w for w in x.split(' ')]))

    # lexical richness: build one object per text and read all three values
    # from it instead of constructing it three times per row
    richness = [LexicalRichness(text) for text in df_total.text]
    df_total['lex_words'] = [r.words for r in richness]
    df_total['lex_uniquewords'] = [r.terms for r in richness]
    # type token ratio : lexical richness
    df_total['lex_ttr'] = [r.ttr for r in richness]

    # *********PSYCOLINGUISTIC FEATURES **********#

    # Sentiment score
    analyser = SentimentIntensityAnalyzer()
    df_total['sentiment_score'] = df_total.text.apply(
        lambda x: analyser.polarity_scores(x)['compound'])

    return df_total

Beispiel #13

0

Datei anzeigen

Datei: features.py Projekt: abigailmunsen/review-helpfulness-prediction

def unique_words(row):
    """Return the ``terms`` value of LexicalRichness for ``row['reviewText']``."""
    return LexicalRichness(row['reviewText']).terms

Beispiel #14

0

Datei anzeigen

Datei: measure_complexity.py Projekt: CiaraG98/CompLingProject

    text_data = np.array(df.iloc[:, 3])
    pod_id = np.array(df.iloc[:, 0])
    speaker_id = np.array(df.iloc[:, 2])

    # measure complexity & record results in excel sheet
    instance_ids = []
    ttr = []
    mtld = []
    number_of_words = []
    readability = []
    unique_words = []
    avg_sentence_length = []
    avg_word_length = []
    for i, instance in enumerate(text_data):
        # lexical richness instance for entry before lemmatization and stopword removal
        lex_with_stopwords = LexicalRichness(instance)
        number_of_words.append(lex_with_stopwords.words)

        # mean sentence length
        mean_sentence_len = int(lex_with_stopwords.words /
                                textstat.sentence_count(instance))
        avg_sentence_length.append(mean_sentence_len)

        # mean word length
        num_chars = sum([len(w) for w in tokenizer.tokenize(instance)])
        mean_word_len = round(num_chars / lex_with_stopwords.words, 1)
        avg_word_length.append(mean_word_len)

        # readability
        readability.append(textstat.flesch_reading_ease(instance))

Beispiel #15

0

Datei anzeigen

Datei: feature_extraction.py Projekt: kherud/native-language-identification

def process_paper(file_path):
    """Extract stylometric features from one paper and pickle them.

    The text between ``detect_start`` and ``detect_end`` is cleaned with
    ``char_re``, parsed with ``nlp`` and summarized into a dict (counts,
    length histograms, lemmas, tags, punctuation frequencies and lexical
    richness metrics) that is written to ``../data/pkls/<name>.pkl``.
    ``<name>`` is the file name up to its first dot.
    """
    with open(file_path, "r") as file:
        text = file.read()

    name = file_path.split("/")[-1].split(".")[0]
    start = detect_start(text, conferences[name]["title"])
    end = detect_end(text)

    text = char_re.sub("", text[start:end])
    doc = nlp(text)

    lr = LexicalRichness(text)

    document = {
        "n_chars": len(text),
        "n_tokens": 0,
        "n_sentences": 0,
        "word_lengths": {},
        "sentence_lengths": {},
        "sentence_tokens": {},
        "punctuation": {},
        "function_words": {},
        "tokens": [],
        "tags": [],
        "pos": [],
        "metrics": {}
    }

    # A metric that cannot be computed is recorded as 0.  ``Exception`` keeps
    # that best-effort behaviour without also swallowing KeyboardInterrupt
    # and SystemExit, which the former bare ``except`` did.
    metrics = document["metrics"]
    # Metrics read as attributes.
    for metric in [
            "cttr", "rttr", "ttr", "Dugast", "Herdan", "Maas", "Summer"
    ]:
        try:
            metrics[metric] = getattr(lr, metric)
        except Exception:
            metrics[metric] = 0
    # Metrics that are methods, called with their default arguments.
    for metric in ["hdd", "mattr", "msttr", "mtld"]:
        try:
            metrics[metric] = getattr(lr, metric)()
        except Exception:
            metrics[metric] = 0

    word_lengths = document["word_lengths"]
    function_word_counts = document["function_words"]
    sentence_lengths = document["sentence_lengths"]
    sentence_tokens = document["sentence_tokens"]

    for sent in doc.sents:
        for token in sent:
            document["tokens"].append(token.lemma_)
            document["pos"].append(token.pos_)
            document["tags"].append(token.tag_)
            token_len = len(token)
            word_lengths[token_len] = word_lengths.get(token_len, 0) + 1
            document["n_tokens"] += 1
            if token.lemma_ in function_words:
                function_word_counts[token.lemma_] = (
                    function_word_counts.get(token.lemma_, 0) + 1)
        # Histogram of sentence length in characters ...
        n_chars = len(sent.text)
        sentence_lengths[n_chars] = sentence_lengths.get(n_chars, 0) + 1
        # ... and in tokens.
        n_tokens = len(sent)
        sentence_tokens[n_tokens] = sentence_tokens.get(n_tokens, 0) + 1
        document["n_sentences"] += 1

    for char in string.punctuation:
        document["punctuation"][char] = text.count(char)

    with open(f"../data/pkls/{name}.pkl", "wb") as file:
        pickle.dump(document, file)

Beispiel #16

0

Datei anzeigen

scores = {}

# Each fetched row holds an ideology name as its first element and the list
# of words associated with that ideology as its second element, e.g.
#
#     ("communism", ["lots", "of", "communist", "words"])
for result in cur.fetchall():
    ideology = result[0]
    # Truncate to smallest_wordlist_length for the STTR formula.
    words = result[1][:smallest_wordlist_length]

    words_big_string = " ".join(words)
    richness = LexicalRichness(words_big_string)
    scores[ideology] = richness.ttr

# Build the bar chart.

# Michael is going to scold me for overusing comprehensions;
# I swear this one is appropriate here.
x_names = [ideology.capitalize() for ideology in scores]
y_values = list(scores.values())
y_pos = np.arange(len(x_names))
plt.bar(y_pos, y_values, align='center', color="#330033", alpha=0.7)
plt.ylim(bottom=.27, top=.285)
plt.xticks(y_pos, x_names)
plt.ylabel('Type Token Ratio')
plt.title('Lexical Richness by Ideology')

Beispiel #17

0

Datei anzeigen

Datei: LexicalRichnessCalculator.py Projekt: LiatNativPersonal/Cognates

    def __init__(self):
        """Start with no stored measures and a LexicalRichness of the empty string."""
        self.measures = dict()
        self.lex = LexicalRichness("")

Beispiel #18

0

Datei anzeigen

def calc_ttr(text):
    """Return the type-token ratio of ``text``, or 0 if it cannot be computed.

    Both building the LexicalRichness object and reading its ratio are
    covered, so unusable input as well as text without words yields 0.
    """
    try:
        return LexicalRichness(text).ttr
    except Exception:
        # Best-effort fallback.  Unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit propagate.
        return 0

Beispiel #19

0

Datei anzeigen

Datei: Project_yl4014_jh4019.py Projekt: connieliye/GroupHL

 def get_complexity(lyrics):
     """Return one minus the type-token ratio of ``lyrics``."""
     type_token_ratio = LexicalRichness(lyrics).ttr
     return 1 - type_token_ratio