def parse_text_by_tagger(input_text):
    """Tag a text and turn it into a feature vector plus readability grades.

    ``input_text`` is passed to ``rfttag`` and the result is iterated as a
    sequence of sentences, each a sequence of token tuples.  Items 0 and 2 of
    every token are lower-cased before the sentence is handed to the
    ``feature_extractors`` functions (presumably word form and lemma, with
    item 1 being the tag -- confirm against ``rfttag``).

    Args:
        input_text: Text to analyse, in whatever form ``rfttag`` accepts.

    Returns:
        A ``(values, grades)`` pair of lists.

        ``values`` holds 29 numbers.  Most are feature counts divided by the
        total word count; ``text_span``, ``sentence_length`` and
        ``type_token_ratio`` are averages/ratios of their own.  The column
        order is the order of the tuples at the end of this function.

        ``grades`` holds seven numbers in this order: Flesch-Kincaid,
        Flesch-Kincaid "flex", SMOG, Coleman-Liau, Dale-Chall, ARI and the
        arithmetic mean of those six.

    A text that produces no sentences, no words or no tokens used to raise
    ``ZeroDivisionError``.  Now every ratio with a zero denominator is
    reported as ``0.0``, and when there are no words or no sentences the
    ``metrics`` formulas are skipped and all grades are ``0.0``.
    """

    def safe_div(numerator, denominator):
        """Divide, returning 0.0 instead of raising when the divisor is zero."""
        return numerator / denominator if denominator else 0.0

    # Per-text totals of the per-sentence feature counts.
    first_person_pronoun = 0
    second_person_pronoun = 0
    pronoun = 0
    finite_verb = 0
    modifier = 0
    total_adverb = 0
    nominalization = 0
    genitive = 0
    passive = 0
    speech_verb = 0
    mental_verb = 0
    verbal_adverb = 0
    passive_participial_clauses = 0
    active_participial_clauses = 0
    imperative = 0
    causative_subordinate = 0
    academic_vocabulary = 0
    parenthesis_attitude = 0
    parenthesis_accentuation = 0
    degree_adverb = 0
    particles = 0
    nouns_minus_head = 0
    top_100_verbs = 0
    top_5000 = 0
    complex_endings = 0
    fsperson_verb = 0

    # Raw totals behind the averages and the readability formulas.
    all_letters = 0
    all_words = 0
    all_syllables = 0
    all_complex_words = 0
    all_sent_words = 0
    all_sent_marks = 0  # number of sentences seen
    sent_span = 0
    all_tokens = 0
    all_types = set()

    for tagged_sent in rfttag(input_text):
        tagged_sent = [(el[0].lower(), el[1], el[2].lower()) for el in tagged_sent]

        first_person_pronoun += feature_extractors.first_person_pronoun(tagged_sent)
        second_person_pronoun += feature_extractors.second_person_pronoun(tagged_sent)
        pronoun += feature_extractors.is_pronoun(tagged_sent)
        finite_verb += feature_extractors.is_finite_verb(tagged_sent)
        modifier += feature_extractors.is_modifier(tagged_sent)
        total_adverb += feature_extractors.total_adverb(tagged_sent)

        # Only the first element of the pair is part of the feature vector.
        nomz, _nouns = feature_extractors.is_nominalization(tagged_sent)
        nominalization += nomz

        genitive += feature_extractors.is_genitive(tagged_sent)
        passive += feature_extractors.is_passive(tagged_sent)
        speech_verb += feature_extractors.speech_verb(tagged_sent)
        mental_verb += feature_extractors.mental_verb(tagged_sent)

        letters, words = feature_extractors.word_length(tagged_sent)
        all_letters += letters
        all_words += words

        syllables, complex_words = feature_extractors.syllables(tagged_sent)
        all_syllables += syllables
        all_complex_words += complex_words

        all_sent_words += feature_extractors.sentence_length(tagged_sent)
        all_sent_marks += 1
        sent_span += feature_extractors.text_span(tagged_sent)

        types, tokens = feature_extractors.type_token_ratio(tagged_sent)
        # In-place update: same resulting set as ``union`` without copying
        # the accumulated set for every sentence.
        all_types.update(types)
        all_tokens += tokens

        verbal_adverb += feature_extractors.is_verbal_adverb(tagged_sent)
        passive_participial_clauses += feature_extractors.passive_participial_clauses(
            tagged_sent
        )
        active_participial_clauses += feature_extractors.active_participial_clauses(
            tagged_sent
        )
        imperative += feature_extractors.imperative_mood(tagged_sent)
        causative_subordinate += feature_extractors.causative_subordinate(tagged_sent)
        academic_vocabulary += feature_extractors.academic_vocabulary(tagged_sent)
        parenthesis_attitude += feature_extractors.parenthesis_attitude_evaluation(
            tagged_sent
        )
        parenthesis_accentuation += feature_extractors.parenthesis_accentuation(
            tagged_sent
        )
        degree_adverb += feature_extractors.degree_adverb(tagged_sent)
        particles += feature_extractors.particles(tagged_sent)

        t1000nouns, _non1000nouns = feature_extractors.top_1000_nouns_minus_head(
            tagged_sent
        )
        nouns_minus_head += t1000nouns

        t100verbs, _non100verbs = feature_extractors.top_100_verbs(tagged_sent)
        top_100_verbs += t100verbs

        top_5000 += feature_extractors.top_5000(tagged_sent)
        complex_endings += feature_extractors.complex_endings(tagged_sent)
        fsperson_verb += feature_extractors.is_12person_verb(tagged_sent)

    word_count = all_words
    sentence_length = safe_div(all_sent_words, all_sent_marks)
    type_token_ratio = safe_div(len(all_types), all_tokens)
    text_span = safe_div(sent_span, all_sent_marks)

    # Readability grades and indexes computed by the ``metrics`` formulas.
    if word_count and all_sent_marks:
        fk_grade = metrics.calc_Flesh_Kincaid_Grade_rus(
            all_syllables, word_count, all_sent_marks
        )
        fk_grade_flex = metrics.calc_Flesh_Kincaid_Grade_rus_flex(
            all_syllables, word_count, all_sent_marks
        )
        cl_grade = metrics.calc_Coleman_Liau_index(
            all_letters, word_count, all_sent_marks
        )
        smog_grade = metrics.calc_SMOG_index(all_complex_words, all_sent_marks)
        dale_grade = metrics.calc_Dale_Chale_index(
            all_complex_words, word_count, all_sent_marks
        )
        ari_index = metrics.calc_ARI_index(all_letters, word_count, all_sent_marks)
    else:
        # The formulas receive the word and sentence counts; with either at
        # zero there is nothing meaningful to compute, so skip them.
        fk_grade = fk_grade_flex = cl_grade = 0.0
        smog_grade = dale_grade = ari_index = 0.0

    complexity_grade = (
        fk_grade + fk_grade_flex + cl_grade + smog_grade + dale_grade + ari_index
    ) / 6

    # NOTE(review): the column order below is presumably relied on by whatever
    # consumes this vector -- confirm before adding, removing or reordering.
    values = [
        safe_div(total, word_count)
        for total in (
            first_person_pronoun,
            second_person_pronoun,
            pronoun,
            finite_verb,
            modifier,
            total_adverb,
            nominalization,
            genitive,
            passive,
            speech_verb,
            mental_verb,
        )
    ]
    values.extend((text_span, sentence_length, type_token_ratio))
    values.extend(
        safe_div(total, word_count)
        for total in (
            verbal_adverb,
            passive_participial_clauses,
            active_participial_clauses,
            imperative,
            causative_subordinate,
            academic_vocabulary,
            parenthesis_attitude,
            parenthesis_accentuation,
            degree_adverb,
            particles,
            nouns_minus_head,
            top_100_verbs,
            top_5000,
            complex_endings,
            fsperson_verb,
        )
    )

    # SMOG deliberately precedes Coleman-Liau here, as in the original order.
    grades = [
        fk_grade,
        fk_grade_flex,
        smog_grade,
        cl_grade,
        dale_grade,
        ari_index,
        complexity_grade,
    ]
    return values, grades
def process_text(id, sents, vectorwriter):
    """Compute the feature vector of one text and write it as a single row.

    Every sentence in ``sents`` is passed to the ``feature_extractors``
    functions and the per-sentence results are summed over the text.

    Args:
        id: Identifier written as the first column of the row.  (The name
            shadows the builtin ``id``; it is kept because it is part of the
            function's interface.)
        sents: Iterable of sentences in the form the ``feature_extractors``
            functions accept.
        vectorwriter: Object with a ``writerow(list)`` method, e.g. a
            ``csv.writer``.

    Returns:
        None.  Exactly one row of 64 columns is passed to
        ``vectorwriter.writerow``: ``id``, 26 per-word feature frequencies,
        average word length, average sentence length, type/token ratio, and
        34 more per-word feature frequencies.  The column order is the order
        of the tuples at the end of this function.

    A text with no sentences, no words or no tokens used to raise
    ``ZeroDivisionError`` and write nothing.  Now every ratio with a zero
    denominator is written as ``0.0`` so each text still gets its row.
    """

    def safe_div(numerator, denominator):
        """Divide, returning 0.0 instead of raising when the divisor is zero."""
        return numerator / denominator if denominator else 0.0

    # Per-text totals of the per-sentence feature counts.
    first_person_pronoun_results = 0
    second_person_pronoun_results = 0
    third_person_pronoun_results = 0
    reflexive_pronoun_results = 0
    adjective_pronoun_results = 0
    nom_pronoun_results = 0
    indefinite_pron_results = 0
    past_tense_results = 0
    perf_aspect_results = 0
    present_tense_results = 0
    place_adverb_results = 0
    time_adverb_results = 0
    total_adverb_results = 0
    wh_questions_results = 0
    nominalization_results = 0
    nouns_results = 0
    passive_results = 0
    by_passive_results = 0
    infin_results = 0
    speech_verb_results = 0
    mental_verb_results = 0
    that_compl_results = 0
    wh_relative_results = 0
    pied_piping_results = 0
    total_PP_results = 0
    exclamation_results = 0
    verbal_adverb_results = 0
    passive_participial_clauses_results = 0
    active_participial_clauses_results = 0
    imperative_mood_results = 0
    predicative_adjectives_results = 0
    attributive_adjective_results = 0
    causative_subordinate_results = 0
    concessive_subordinate_results = 0
    conditional_subordinate_results = 0
    purpose_subordinate_results = 0
    negation_results = 0
    conditional_mood_results = 0
    modal_possibility_results = 0
    modal_necessity_results = 0
    evaluative_vocabulary_results = 0
    evidentiality_results = 0
    parenthesis_attitude_evaluation_results = 0
    animate_nouns_results = 0
    parenthesis_accentuation_results = 0
    parenthesis_relation_results = 0
    phrasal_coordination_results = 0
    other_coordination_results = 0
    degree_adverb_results = 0
    particles_results = 0
    time_nouns_results = 0
    quantity_nouns_results = 0
    causative_verb_results = 0
    numeral_results = 0
    existential_verb_results = 0
    change_verb_results = 0
    movement_verb_results = 0
    phisical_prop_adjective_results = 0
    time_adjective_results = 0
    size_adjective_results = 0

    # Raw totals behind the three average/ratio columns.
    all_letters = 0
    all_words = 0
    all_sent_words = 0
    all_sent_marks = 0  # number of sentences seen
    all_tokens = 0
    all_types = set()

    for sent in sents:
        first_person_pronoun_results += feature_extractors.first_person_pronoun(sent)
        second_person_pronoun_results += feature_extractors.second_person_pronoun(sent)
        third_person_pronoun_results += feature_extractors.third_person_pronoun(sent)
        reflexive_pronoun_results += feature_extractors.reflexive_pronoun(sent)
        adjective_pronoun_results += feature_extractors.adjective_pronoun(sent)
        nom_pronoun_results += feature_extractors.nom_pronoun(sent)
        indefinite_pron_results += feature_extractors.indefinite_pron(sent)
        past_tense_results += feature_extractors.past_tense(sent)
        perf_aspect_results += feature_extractors.perf_aspect(sent)
        present_tense_results += feature_extractors.present_tense(sent)
        place_adverb_results += feature_extractors.place_adverb(sent)
        time_adverb_results += feature_extractors.time_adverb(sent)
        total_adverb_results += feature_extractors.total_adverb(sent)
        wh_questions_results += feature_extractors.wh_questions(sent)

        nomz, nouns = feature_extractors.is_nominalization(sent)
        nominalization_results += nomz
        nouns_results += nouns

        passive, by_passive = feature_extractors.is_agentless_passive(sent)
        passive_results += passive
        by_passive_results += by_passive

        infin_results += feature_extractors.infinitives(sent)
        speech_verb_results += feature_extractors.speech_verb(sent)
        mental_verb_results += feature_extractors.mental_verb(sent)
        that_compl_results += feature_extractors.that_complement(sent)

        wh_rel, pied_pip = feature_extractors.wh_relatives_and_pied_piping(sent)
        wh_relative_results += wh_rel
        pied_piping_results += pied_pip

        total_PP_results += feature_extractors.total_PP(sent)
        exclamation_results += feature_extractors.is_exclamation(sent)

        letters, words = feature_extractors.word_length(sent)
        all_letters += letters
        all_words += words

        all_sent_words += feature_extractors.sentence_length(sent)
        all_sent_marks += 1

        types, tokens = feature_extractors.type_token_ratio(sent)
        # In-place update: same resulting set as ``union`` without copying
        # the accumulated set for every sentence.
        all_types.update(types)
        all_tokens += tokens

        verbal_adverb_results += feature_extractors.is_verbal_adverb(sent)
        passive_participial_clauses_results += (
            feature_extractors.passive_participial_clauses(sent)
        )
        active_participial_clauses_results += (
            feature_extractors.active_participial_clauses(sent)
        )
        imperative_mood_results += feature_extractors.imperative_mood(sent)
        predicative_adjectives_results += feature_extractors.predicative_adjectives(sent)
        attributive_adjective_results += feature_extractors.attributive_adjective(sent)
        causative_subordinate_results += feature_extractors.causative_subordinate(sent)
        concessive_subordinate_results += feature_extractors.concessive_subordinate(sent)
        conditional_subordinate_results += (
            feature_extractors.conditional_subordinate(sent)
        )
        purpose_subordinate_results += feature_extractors.purpose_subordinate(sent)
        negation_results += feature_extractors.negation(sent)
        conditional_mood_results += feature_extractors.conditional_mood(sent)
        modal_possibility_results += feature_extractors.modal_possibility(sent)
        modal_necessity_results += feature_extractors.modal_necessity(sent)
        evaluative_vocabulary_results += feature_extractors.evaluative_vocabulary(sent)
        evidentiality_results += feature_extractors.evidentiality(sent)
        parenthesis_attitude_evaluation_results += (
            feature_extractors.parenthesis_attitude_evaluation(sent)
        )
        animate_nouns_results += feature_extractors.animate_nouns(sent)
        parenthesis_accentuation_results += (
            feature_extractors.parenthesis_accentuation(sent)
        )
        parenthesis_relation_results += feature_extractors.parenthesis_relation(sent)

        phrasal, other = feature_extractors.coordination(sent)
        phrasal_coordination_results += phrasal
        other_coordination_results += other

        degree_adverb_results += feature_extractors.degree_adverb(sent)
        particles_results += feature_extractors.particles(sent)
        time_nouns_results += feature_extractors.time_nouns(sent)
        quantity_nouns_results += feature_extractors.quantity_nouns(sent)
        causative_verb_results += feature_extractors.causative_verb(sent)
        numeral_results += feature_extractors.numeral(sent)
        existential_verb_results += feature_extractors.existential_verb(sent)
        change_verb_results += feature_extractors.change_verb(sent)
        movement_verb_results += feature_extractors.movement_verb(sent)
        phisical_prop_adjective_results += (
            feature_extractors.phisical_prop_adjective(sent)
        )
        time_adjective_results += feature_extractors.time_adjective(sent)
        size_adjective_results += feature_extractors.size_adjective(sent)

    word_count = all_words
    sentence_length_results = safe_div(all_sent_words, all_sent_marks)
    type_token_ratio_results = safe_div(len(all_types), all_tokens)
    word_length_results = safe_div(all_letters, all_words)

    # The row layout is defined once, here: id, per-word frequencies, the
    # three averages, then the remaining per-word frequencies.
    values = [id]
    values.extend(
        safe_div(total, word_count)
        for total in (
            first_person_pronoun_results,
            second_person_pronoun_results,
            third_person_pronoun_results,
            reflexive_pronoun_results,
            adjective_pronoun_results,
            nom_pronoun_results,
            indefinite_pron_results,
            past_tense_results,
            perf_aspect_results,
            present_tense_results,
            place_adverb_results,
            time_adverb_results,
            total_adverb_results,
            wh_questions_results,
            nominalization_results,
            nouns_results,
            passive_results,
            by_passive_results,
            infin_results,
            speech_verb_results,
            mental_verb_results,
            that_compl_results,
            wh_relative_results,
            pied_piping_results,
            total_PP_results,
            exclamation_results,
        )
    )
    values.extend(
        (word_length_results, sentence_length_results, type_token_ratio_results)
    )
    values.extend(
        safe_div(total, word_count)
        for total in (
            verbal_adverb_results,
            passive_participial_clauses_results,
            active_participial_clauses_results,
            imperative_mood_results,
            predicative_adjectives_results,
            attributive_adjective_results,
            causative_subordinate_results,
            concessive_subordinate_results,
            conditional_subordinate_results,
            purpose_subordinate_results,
            negation_results,
            conditional_mood_results,
            modal_possibility_results,
            modal_necessity_results,
            evaluative_vocabulary_results,
            evidentiality_results,
            parenthesis_attitude_evaluation_results,
            animate_nouns_results,
            parenthesis_accentuation_results,
            parenthesis_relation_results,
            phrasal_coordination_results,
            other_coordination_results,
            degree_adverb_results,
            particles_results,
            time_nouns_results,
            quantity_nouns_results,
            causative_verb_results,
            numeral_results,
            existential_verb_results,
            change_verb_results,
            movement_verb_results,
            phisical_prop_adjective_results,
            time_adjective_results,
            size_adjective_results,
        )
    )
    vectorwriter.writerow(values)