# Extractors (attributes of ``feature_extractors``) whose per-sentence counts
# are summed and divided by the word count.  Table order is both the call
# order and the order of the resulting columns in ``values``.  The flag marks
# extractors that return a two-element pair of which only the first element
# is accumulated.  Extractors not listed here are not part of this vector.
_TAGGER_FEATURES_BEFORE_STATS = (
    ("first_person_pronoun", False),
    ("second_person_pronoun", False),
    ("is_pronoun", False),
    ("is_finite_verb", False),
    ("is_modifier", False),
    ("total_adverb", False),
    ("is_nominalization", True),
    ("is_genitive", False),
    ("is_passive", False),
    ("speech_verb", False),
    ("mental_verb", False),
)
_TAGGER_FEATURES_AFTER_STATS = (
    ("is_verbal_adverb", False),
    ("passive_participial_clauses", False),
    ("active_participial_clauses", False),
    ("imperative_mood", False),
    ("causative_subordinate", False),
    ("academic_vocabulary", False),
    ("parenthesis_attitude_evaluation", False),
    ("parenthesis_accentuation", False),
    ("degree_adverb", False),
    ("particles", False),
    ("top_1000_nouns_minus_head", True),
    ("top_100_verbs", True),
    ("top_5000", False),
    ("complex_endings", False),
    ("is_12person_verb", False),
)


def _add_tagger_sentence_counts(specs, tagged_sent, totals):
    """Run every extractor in *specs* on *tagged_sent*, adding to *totals* in place."""
    for slot, (extractor_name, returns_pair) in enumerate(specs):
        outcome = getattr(feature_extractors, extractor_name)(tagged_sent)
        if returns_pair:
            # Exactly two elements are expected; only the first one is kept.
            outcome, _ = outcome
        totals[slot] += outcome


def parse_text_by_tagger(input_text):
    """Extract word-normalised features and readability grades from a text.

    The text is tagged with ``rfttag``.  For every tagged sentence the first
    and third field of each element are lower-cased, then the sentence is
    passed to the ``feature_extractors`` functions.  Counts are summed over
    all sentences and divided by the total word count reported by
    ``feature_extractors.word_length``.

    Args:
        input_text: Value handed to ``rfttag`` unchanged.  Iterating the
            result must yield sentences whose elements are indexable at
            0, 1 and 2 (presumably word, tag and lemma -- TODO confirm).

    Returns:
        A ``(values, grades)`` pair of lists.

        ``values`` has 29 entries: the 11 ratios from
        ``_TAGGER_FEATURES_BEFORE_STATS``, then mean text span per sentence,
        mean sentence length and type/token ratio, then the 15 ratios from
        ``_TAGGER_FEATURES_AFTER_STATS``.

        ``grades`` has 7 entries, in this order: Flesch-Kincaid,
        Flesch-Kincaid "flex", SMOG, Coleman-Liau, Dale-Chall, ARI, and the
        arithmetic mean of those six.

    Raises:
        ZeroDivisionError: If tagging yields no sentences, or the extractors
            report zero tokens or zero words.
    """
    leading_totals = [0] * len(_TAGGER_FEATURES_BEFORE_STATS)
    trailing_totals = [0] * len(_TAGGER_FEATURES_AFTER_STATS)
    all_letters = 0
    all_words = 0
    all_syllables = 0
    all_complex_words = 0
    all_sent_words = 0
    all_sent_marks = 0
    sent_span = 0
    all_types = set()
    all_tokens = 0

    for tagged_sent in rfttag(input_text):
        tagged_sent = [(el[0].lower(), el[1], el[2].lower()) for el in tagged_sent]

        _add_tagger_sentence_counts(
            _TAGGER_FEATURES_BEFORE_STATS, tagged_sent, leading_totals)

        # Raw material for the length statistics and the readability formulas.
        letters, words = feature_extractors.word_length(tagged_sent)
        all_letters += letters
        all_words += words
        syllables, complex_words = feature_extractors.syllables(tagged_sent)
        all_syllables += syllables
        all_complex_words += complex_words
        all_sent_words += feature_extractors.sentence_length(tagged_sent)
        all_sent_marks += 1
        sent_span += feature_extractors.text_span(tagged_sent)
        types, tokens = feature_extractors.type_token_ratio(tagged_sent)
        all_types.update(types)
        all_tokens += tokens

        _add_tagger_sentence_counts(
            _TAGGER_FEATURES_AFTER_STATS, tagged_sent, trailing_totals)

    # NOTE(review): the ratios below rely on true division (Python 3 or
    # ``from __future__ import division``) -- confirm for this module.
    sentence_length = all_sent_words / all_sent_marks
    type_token_ratio = len(all_types) / all_tokens
    text_span = sent_span / all_sent_marks
    word_count = all_words

    # Built before the grades so that a zero word count fails here with
    # ZeroDivisionError rather than somewhere inside ``metrics``.
    values = [total / word_count for total in leading_totals]
    values.extend((text_span, sentence_length, type_token_ratio))
    values.extend(total / word_count for total in trailing_totals)

    # Readability grades and indexes computed by formula.
    fk_grade = metrics.calc_Flesh_Kincaid_Grade_rus(all_syllables, word_count, all_sent_marks)
    fk_grade_flex = metrics.calc_Flesh_Kincaid_Grade_rus_flex(all_syllables, word_count, all_sent_marks)
    cl_grade = metrics.calc_Coleman_Liau_index(all_letters, word_count, all_sent_marks)
    smog_grade = metrics.calc_SMOG_index(all_complex_words, all_sent_marks)
    dale_grade = metrics.calc_Dale_Chale_index(all_complex_words, word_count, all_sent_marks)
    ari_index = metrics.calc_ARI_index(all_letters, word_count, all_sent_marks)
    complexity_grade = (fk_grade + fk_grade_flex + cl_grade + smog_grade + dale_grade + ari_index) / 6

    # SMOG is listed before Coleman-Liau here although it is computed after it.
    grades = [
        fk_grade,
        fk_grade_flex,
        smog_grade,
        cl_grade,
        dale_grade,
        ari_index,
        complexity_grade,
    ]

    return values, grades
Example #2
0
# Extractors (attributes of ``feature_extractors``) called for every sentence,
# paired with the number of counts each call returns.  Table order is both the
# call order and the column order of the written row; an extractor of width 2
# fills two adjacent columns.
_PROCESS_TEXT_HEAD = (
    ("first_person_pronoun", 1),
    ("second_person_pronoun", 1),
    ("third_person_pronoun", 1),
    ("reflexive_pronoun", 1),
    ("adjective_pronoun", 1),
    ("nom_pronoun", 1),
    ("indefinite_pron", 1),
    ("past_tense", 1),
    ("perf_aspect", 1),
    ("present_tense", 1),
    ("place_adverb", 1),
    ("time_adverb", 1),
    ("total_adverb", 1),
    ("wh_questions", 1),
    ("is_nominalization", 2),
    ("is_agentless_passive", 2),
    ("infinitives", 1),
    ("speech_verb", 1),
    ("mental_verb", 1),
    ("that_complement", 1),
    ("wh_relatives_and_pied_piping", 2),
    ("total_PP", 1),
    ("is_exclamation", 1),
)
_PROCESS_TEXT_TAIL = (
    ("is_verbal_adverb", 1),
    ("passive_participial_clauses", 1),
    ("active_participial_clauses", 1),
    ("imperative_mood", 1),
    ("predicative_adjectives", 1),
    ("attributive_adjective", 1),
    ("causative_subordinate", 1),
    ("concessive_subordinate", 1),
    ("conditional_subordinate", 1),
    ("purpose_subordinate", 1),
    ("negation", 1),
    ("conditional_mood", 1),
    ("modal_possibility", 1),
    ("modal_necessity", 1),
    ("evaluative_vocabulary", 1),
    ("evidentiality", 1),
    ("parenthesis_attitude_evaluation", 1),
    ("animate_nouns", 1),
    ("parenthesis_accentuation", 1),
    ("parenthesis_relation", 1),
    ("coordination", 2),
    ("degree_adverb", 1),
    ("particles", 1),
    ("time_nouns", 1),
    ("quantity_nouns", 1),
    ("causative_verb", 1),
    ("numeral", 1),
    ("existential_verb", 1),
    ("change_verb", 1),
    ("movement_verb", 1),
    ("phisical_prop_adjective", 1),
    ("time_adjective", 1),
    ("size_adjective", 1),
)


def process_text(id, sents, vectorwriter):
    """Compute the feature row of one text and hand it to ``vectorwriter``.

    Every sentence in ``sents`` is passed to the ``feature_extractors``
    functions; the counts are summed over the text and divided by the total
    word count reported by ``feature_extractors.word_length``.

    Args:
        id: Identifier written as the first cell of the row.
        sents: Iterable of sentences in whatever form the extractors accept.
        vectorwriter: Object with a ``writerow(list)`` method (presumably a
            ``csv.writer`` -- verify against the caller).

    Returns:
        None.  The row is ``[id]``, the 26 ratios from ``_PROCESS_TEXT_HEAD``,
        mean word length, mean sentence length, type/token ratio, and the 34
        ratios from ``_PROCESS_TEXT_TAIL``.

    Raises:
        ZeroDivisionError: If ``sents`` is empty, or the extractors report
            zero tokens or zero words.
    """

    def tally(specs, sent, totals):
        # Add the counts of one sentence to ``totals`` in place.
        column = 0
        for extractor_name, width in specs:
            outcome = getattr(feature_extractors, extractor_name)(sent)
            if width == 1:
                totals[column] += outcome
            else:
                first, second = outcome
                totals[column] += first
                totals[column + 1] += second
            column += width

    head_totals = [0] * sum(width for _, width in _PROCESS_TEXT_HEAD)
    tail_totals = [0] * sum(width for _, width in _PROCESS_TEXT_TAIL)
    letter_total = 0
    word_total = 0
    sentence_word_total = 0
    sentence_total = 0
    seen_types = set()
    token_total = 0

    for sent in sents:
        tally(_PROCESS_TEXT_HEAD, sent, head_totals)

        # Raw material for the three length/diversity statistics.
        letters, words = feature_extractors.word_length(sent)
        letter_total += letters
        word_total += words
        sentence_word_total += feature_extractors.sentence_length(sent)
        sentence_total += 1
        types, tokens = feature_extractors.type_token_ratio(sent)
        seen_types.update(types)
        token_total += tokens

        tally(_PROCESS_TEXT_TAIL, sent, tail_totals)

    mean_sentence_length = sentence_word_total / sentence_total
    type_token_ratio = len(seen_types) / token_total
    mean_word_length = letter_total / word_total

    row = [id]
    row.extend(total / word_total for total in head_totals)
    row.extend((mean_word_length, mean_sentence_length, type_token_ratio))
    row.extend(total / word_total for total in tail_totals)

    vectorwriter.writerow(row)