def create_data_set(i):
    """Build a per-sentence feature DataFrame for corpus document *i*.

    Reads ../Corpus/<i>.txt, splits it into sentences with the
    module-level spaCy pipeline ``nlp``, extracts one hand-crafted
    feature dict per sentence, and merges those features with the
    bag-of-words representation produced by the module-level, already
    fitted CountVectorizer ``cv``.

    Parameters
    ----------
    i : document identifier used to locate ../Corpus/<i>.txt

    Returns
    -------
    pandas.DataFrame
        One row per sentence: the hand-crafted feature columns plus one
        column per vocabulary term of ``cv``.
    """
    # `with` guarantees the corpus file is closed even if parsing raises
    # (the original leaked the handle).
    with open("../Corpus/%s.txt" % i, "r") as f_in:
        text = f_in.read()

    doc = nlp(text)
    # NOTE(review): Span.string is deprecated in spaCy v3 (use .text) —
    # confirm the pinned spaCy version before changing.
    sentences = [sent.string.strip() for sent in doc.sents]

    f = []
    for s in sentences:
        print(s)
        sen = nlp(s)
        features = {
            "text": s,
            "closing_question_mark":
                sentence_closes_with_question_mark
                .sentence_closes_with_question_marks(sen),  # bool
            "contains_adverb":
                contains_adverb.contains_adverb(sen),  # bool
            "contains_modal":
                contains_modal_verb.contains_modal_verb(sen),  # bool
            "first_person":
                reference_to_first_person.contains_first_person(sen),  # bool
            "argumentative_discourse_markers":
                argumentative_discourse_markers
                .contains_argumentative_markers(sen),
            "contains_named_entities": ner.contains_ner(sen),
            "sentence_length": sentence_length.get_sentence_length(sen),
        }
        f.append(features)

    df = pd.DataFrame.from_dict(f)
    # Vectorize the raw sentence text with the fitted CountVectorizer.
    X1 = cv.transform(df.text)
    df = df.drop(columns='text')
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2
    # (use get_feature_names_out()) — confirm the pinned sklearn version.
    count_vect_df = pd.DataFrame(X1.todense(), columns=cv.get_feature_names())
    combined_df = pd.concat([df, count_vect_df], axis=1)
    return combined_df
def create_data_set(text, cv):
    """Turn raw *text* into a per-sentence feature DataFrame.

    The module-level spaCy pipeline ``nlp`` performs sentence splitting;
    each resulting row combines hand-crafted linguistic features with the
    bag-of-words columns produced by the fitted CountVectorizer *cv*.
    """
    parsed = nlp(text)
    sentence_texts = [span.string.strip() for span in parsed.sents]

    # One feature dict per sentence.
    rows = [_sentence_features(sentence) for sentence in sentence_texts]

    frame = pd.DataFrame.from_dict(rows)
    bow_matrix = cv.transform(frame.text)
    frame = frame.drop(columns='text')
    bow_frame = pd.DataFrame(bow_matrix.todense(),
                             columns=cv.get_feature_names())
    return pd.concat([frame, bow_frame], axis=1)


def _sentence_features(sentence):
    """Return the hand-crafted feature dict for one sentence string."""
    analyzed = nlp(sentence)
    return {
        "text": sentence,
        "closing_question_mark":
            sentence_closes_with_question_mark
            .sentence_closes_with_question_marks(analyzed),  # bool
        "contains_adverb":
            contains_adverb.contains_adverb(analyzed),  # bool
        "contains_modal":
            contains_modal_verb.contains_modal_verb(analyzed),  # bool
        "first_person":
            reference_to_first_person.contains_first_person(analyzed),  # bool
        "argumentative_discourse_markers":
            argumentative_discourse_markers
            .contains_argumentative_markers(analyzed),
        "contains_named_entities": ner.contains_ner(analyzed),
        "sentence_length": sentence_length.get_sentence_length(analyzed),
    }
def create_data_set(three_classes):
    """Build the labelled sentence dataset from corpus documents 0..989.

    For each document i, reads the raw text (../Corpus/<i>.txt) and its
    tab-separated annotation file (../Corpus/<i>.ann), sentence-splits
    the text with the module-level spaCy pipeline ``nlp``, labels each
    sentence via ``get_label``, and extracts the hand-crafted feature
    dict per sentence.

    Parameters
    ----------
    three_classes : bool
        When falsy, the 'Claim' and 'Premise' labels are collapsed into
        a single 'Argumentative' label; otherwise the original labels
        from the .ann file are kept.

    Returns
    -------
    The result of ``create_dataframe`` over the list of feature dicts.
    """
    f = []
    for i in range(0, 990):
        # Context managers close both files per document even on error
        # (the original leaked two handles per document, ~1980 total).
        with open("../Corpus/%s.txt" % i, "r") as f_in:
            text = f_in.read()

        annotations = []
        with open("../Corpus/%s.ann" % i, "r") as f_ann:
            # Iterate the file directly instead of a manual readline loop.
            for line in f_ann:
                parts = line.split("\t")
                # Only 'T' (text-bound) annotation lines carry a span and label.
                if parts[0].startswith("T"):
                    annotation = parts[2]
                    label = parts[1].split(" ")[0]
                    if not three_classes and label in ('Claim', 'Premise'):
                        label = 'Argumentative'
                    annotations.append((annotation.strip(), label))

        doc = nlp(text)
        # NOTE(review): Span.string is deprecated in spaCy v3 (use .text) —
        # confirm the pinned spaCy version before changing.
        sentences = [sent.string.strip() for sent in doc.sents]
        for s in sentences:
            sen = nlp(s)
            features = {
                "label": get_label(s, annotations),
                "text": s,
                "closing_question_mark":
                    sentence_closes_with_question_mark
                    .sentence_closes_with_question_marks(sen),  # bool
                "contains_adverb":
                    contains_adverb.contains_adverb(sen),  # bool
                "contains_modal":
                    contains_modal_verb.contains_modal_verb(sen),  # bool
                "first_person":
                    reference_to_first_person.contains_first_person(sen),  # bool
                "argumentative_discourse_markers":
                    argumentative_discourse_markers
                    .contains_argumentative_markers(sen),
                "contains_named_entities": ner.contains_ner(sen),
                "sentence_length":
                    sentence_length.get_sentence_length(sen),
            }
            f.append(features)

    dataset = create_dataframe(f)
    return dataset