Python NERの例、NER Pythonの例

コード例 #1

0

ファイルを表示

def send(web):
    """Extract the content of ``web`` and run ``NER.data`` on it.

    Returns whatever ``NER.data`` produces (according to the original
    comment, a list of noun-phrase sentences).
    """
    return NER.data(content_extract(web))

コード例 #2

0

ファイルを表示

ファイル: main.py プロジェクト: calmzealA/Resume-Parser

def main():
    """Convert ``test.doc`` to text, print its NER chunks and run the regex extractor."""
    text = DocToText("test.doc")

    # The parenthesised single-argument form prints identically on Python 2
    # and is valid syntax on Python 3, unlike the bare ``print x`` statement.
    print(NER.chunk_NER(text))

    info = regex_info_extractor(text)

コード例 #3

0

ファイルを表示

def q3(query, k=3):
    """
    Returns the top k entities associated with a query.

    Parameters
    ----------
    query: String
        The query to be passed to obtain documents.
    k: int
        The number of top entities to be returned; ``None`` is treated as 3.

    Returns
    -------
    The result of ``NER.common_entities`` (documented by the original author
    as a list of strings), or None when no document matches the query.
    """
    top_k = 3 if k is None else int(k)

    # NOTE(review): pickle.load can execute arbitrary code; the data file
    # must come from a trusted source.
    with open("data/search_engine_data.txt", "rb") as handle:
        engine = pickle.load(handle)

    matching_ids = engine.get_matches_OR(engine.filtered_tokenize(query))

    # No document matched any query token: report it with None, not an error.
    if len(matching_ids) == 0:
        return None

    doc_tokens = [engine.unfiltered_tokens[doc_id] for doc_id in matching_ids]
    return NER.common_entities(doc_tokens, k=top_k)

コード例 #4

0

ファイルを表示

ファイル: preprocess_ag_data.py プロジェクト: piesauce/privacy_hidden_representations

def gen_examples():
    """
    Build the train, dev and test example splits from the raw data.

    Only records whose category is 'World', 'Entertainment', 'Sports' or
    'Business' are turned into examples.  The examples are then passed
    through ``NER.ne_extract(..., top=5)`` and split with ``train_test_split``.
    """
    wanted = ['World', 'Entertainment', 'Sports', 'Business']
    label_of = dict((name, idx) for idx, name in enumerate(wanted))

    collected = []
    # Text of the title/category/description nodes seen since the last
    # 'pubdate' node; indexed below as [0], [1] and [2] respectively.
    # presumably the nodes arrive in that order -- verify against the data.
    fields = []

    for node in get_data():
        if node.tag in ('title', 'category', 'description'):
            fields.append(node.text)
        if node.tag != 'pubdate':
            continue

        # A 'pubdate' node closes the current record.
        usable = (len(fields) == 3
                  and fields[1] in wanted
                  and fields[0] is not None
                  and fields[2] is not None)
        if usable:
            raw_text = fields[0] + " " + fields[2]
            collected.append(Example(unescape_chars(raw_text),
                                     label_of[fields[1]]))
        fields = []

    filtered = NER.ne_extract(collected, top=5)
    train, dev, test = train_test_split(filtered)
    return train, dev, test

コード例 #5

0

ファイルを表示

ファイル: main.py プロジェクト: krishh-konar/Resume-Parser

def main():
	"""Convert ``test.doc`` to text, print its NER chunks and run the regex extractor."""
	text = DocToText("test.doc")

	# The parenthesised single-argument form prints identically on Python 2
	# and is valid syntax on Python 3, unlike the bare ``print x`` statement.
	print(NER.chunk_NER(text))

	info = regex_info_extractor(text)

コード例 #6

0

ファイルを表示

ファイル: validate.py プロジェクト: gsantia/context-sensitive-NER

def crossValidate(n=10,
                  multi=True,
                  threshold=None,
                  NEtype="location"):
    """Perform n-fold cross validation on the data, in order to optimize
    the threshold parameters and maximize F1 score for the various models.

    Parameters
    ----------
    n: number of folds passed to ``split_data``.
    multi: forwarded to ``NER.runTest``.
    threshold: forwarded to ``NER.evaluate``; ``None`` (the default) means
        ``[0.262, 0.880]``.  A fresh list is built per call so the default
        can never be mutated across calls.
    NEtype: entity type forwarded to ``NER.runTest`` / ``NER.evaluate`` and
        used in the name of the result file.

    Returns
    -------
    The per-fold results after ``sortDict``; the same object is also dumped
    as JSON to ``'2017plus_' + NEtype + '.result'``.
    """
    if threshold is None:
        threshold = [0.262, 0.880]

    training_2017 = '../data/2017/wnut17train.conll'

    data_split = split_data(training_2017, n)
    result = defaultdict(defaultdict)

    # load in the _exdeflike_ and _indeflike_ data
    # NOTE(review): pickle.load can execute arbitrary code; these files must
    # come from a trusted source.
    with open('exdeflike.txt', 'rb') as infile:
        total_exdef = pickle.load(infile)

    with open('indeflike.txt', 'rb') as f:
        total_indef = pickle.load(f)

    # put the indefs and exdefs in lists with the same indices
    exdeflikes = [None] * n
    indeflikes = [None] * n
    for fold in total_exdef:
        exdeflikes[int(fold)] = total_exdef[fold]
        indeflikes[int(fold)] = total_indef[fold]

    # each of the n partitions gets a chance to be the test data; the
    # enumerate element is exactly the partition the old copy-and-pop produced
    for i, test in enumerate(data_split):
        print("performing " + str(i) + " fold")
        # we split up runTest and evaluate
        testData = NER.runTest(test,
                               exdeflikes[i],
                               indeflikes[i],
                               multi=multi,
                               NEtype=NEtype)
        result[str(i)] = NER.evaluate(testData, threshold, NEtype=NEtype)
    sort_result = sortDict(result)

    with open('2017plus_' + NEtype + '.result', 'w') as outfile:
        json.dump(sort_result, outfile, sort_keys=True, indent=4)
    return sort_result

コード例 #7

0

ファイルを表示

ファイル: Locations.py プロジェクト: Jrhenderson11/NLP

def filter_basic(candidates):
    """Keep candidates that start with capitalised words followed by a digit-led token.

    A candidate is kept when it begins with one or more "Capitalised word +
    space" groups followed by a token that starts with a digit.  The
    survivors are passed through ``NER.filter_part_dates`` and that result
    is returned.
    """
    import NER
    # Raw string: "\w" in a normal string literal is an invalid escape
    # sequence (a warning on Python 3.6+).  The pattern itself is unchanged.
    pattern = re.compile(r"([A-Z]\w+ )+[0-9]\w+")
    matches = [word for word in candidates if pattern.match(word)]

    return NER.filter_part_dates(matches)

コード例 #8

0

ファイルを表示

 def __init__(self):
     """Reset the mutable state and instantiate the helper objects."""
     # Plain state, filled in later.
     self.questions = None
     self.predictions = []
     self.cached_doc = []

     # Helper objects, constructed in their original relative order.
     self.para_searcher = paragraphSearch.ParaSearcher()
     self.sent_searcher = paragraphSearch.SentSearcher()
     self.classifier = questionClassifier.QuestionClassfier()
     self.ner_tagger = NER.NERTagger()
     self.extracter = answerExtracter.AnswerExtracter()

コード例 #9

0

ファイルを表示

def evaluate_speakers(fileName, speakers, notspeakers, acnames):
	"""Score predicted speakers against the actual names and print the metrics.

	Parameters:
		fileName: only used in the printed report.  The file is no longer
			read here: the text was unused because ``acnames`` is supplied
			by the caller.
		speakers: names predicted to be speakers.
		notspeakers: names predicted not to be speakers.
		acnames: the actual (tagged) speaker names.

	Returns:
		The tuple ``(accuracy, precision, recall, f1)``.
	"""
	# actual names that were / were not predicted
	tp = 0.0
	fn = 0.0
	for name in acnames:
		if name in speakers:
			tp += 1
		else:
			fn += 1

	# a predicted name only counts as a false positive when it neither
	# equals an actual name nor matches one via NER.contains_part_name
	fp = 0.0
	for name in speakers:
		if name in acnames:
			continue
		if not any(NER.contains_part_name(name, acname) for acname in acnames):
			fp += 1

	tn = 0.0
	for name in notspeakers:
		if name not in acnames:
			tn += 1

	total = tp + tn + fp + fn
	acc = 0
	if total > 0:
		acc = (tp + tn) / total
	# if both are empty we have 100% accuracy
	if acnames == speakers and acnames == []:
		acc = 1

	precision = 0
	if len(speakers) > 0:
		precision = tp / len(speakers)

	recall = 0
	if len(acnames) > 0:
		recall = tp / len(acnames)

	if precision + recall == 0:
		f1 = 0
	else:
		f1 = 2 * (precision * recall / (precision + recall))

	# Parenthesised single-argument prints behave the same on Python 2 and 3.
	print("evaluation of file " + fileName)
	print("accuracy: " + str(acc))
	print("precision: " + str(precision))
	print("recall: " + str(recall))
	print("f1: " + str(f1))
	return (acc, precision, recall, f1)

コード例 #10

0

ファイルを表示

ファイル: validate.py プロジェクト: gsantia/context-sensitive-NER

def loadExtra():
    """Build B/I-tagged token lists from the lexicon and acronym files and
    pass them to ``NER.trainExtra``.

    Every line of every file in ``../data/lexicon`` becomes one list of
    ``(token, tag)`` pairs: the first token is tagged "B", the rest "I".
    For the acronym files only the last whitespace-separated token of each
    line is kept, tagged "B".  Nothing is returned.
    """
    tagged = []

    # lexicon files: one multi-token entry per line
    for path in glob.glob('../data/lexicon/*.*'):
        with open(path, 'r') as handle:
            lines = handle.read().strip().split("\n")
        for line in lines:
            tagged.append([(token, "I" if position else "B")
                           for position, token in enumerate(line.split())])

    # acronym files: the acronym is the last token on the line
    for path in glob.glob('../data/acronyms/*.*.*'):
        with open(path, 'r') as handle:
            lines = handle.read().strip().split("\n")
        for line in lines:
            entry = []
            if line:
                entry.append((line.split()[-1], "B"))  # always a single word
            # an empty line still contributes an (empty) entry
            tagged.append(entry)

    # train and write the data to disk
    NER.trainExtra(tagged)

コード例 #11

0

ファイルを表示

def sharednams():
    """Print two quick sanity checks and the name parts shared by two texts.

    The names of both hard-coded files are obtained with
    ``NER.print_names_file``, split on single spaces, and the parts that
    occur in both files are printed one per line.
    """
    # Parenthesised single-argument prints behave the same on Python 2 and 3;
    # the original mixed this form with bare print statements.
    print("abc".upper() == "ABC")
    print(NER.filtermonths(["jan", "Jan", "January", "word"]))

    names1 = NER.print_names_file("/home/james/Desktop/India.txt")
    names2 = NER.print_names_file('/home/james/Desktop/Tintin.txt')

    parts1 = set(part for name in names1 for part in name.split(" "))
    parts2 = set(part for name in names2 for part in name.split(" "))

    print("Common names:")
    for part in parts1 & parts2:
        print(part)

コード例 #12

0

ファイルを表示

ファイル: Locations.py プロジェクト: Jrhenderson11/NLP

def has_building_words(candiates):
    """Return the candidates that contain at least one building word.

    The lower-case vocabulary below is extended with whatever
    ``NER.name_capitalise`` returns for it; matching is a plain substring
    test, and the order of ``candiates`` is preserved.
    """
    import NER
    # both British and American spellings are listed
    building_words = [
        "room", "building", "hall", "auditorium", "wing", "floor", "center",
        "centre", "school", "theater", "theatre", "library", "university",
        "tower", "college", "institute", "avenue"
    ]
    # add capitalised versions as well
    building_words.extend(NER.name_capitalise(building_words))

    return [line for line in candiates
            if any(word in line for word in building_words)]

コード例 #13

0

ファイルを表示

def q2(ent, k=3):
    """
    Returns the top k entities associated with ent.

    Parameters
    ----------
    ent: String
        The entity to be passed as a query; it is lower-cased first.
    k: int
        The number of top entities to be returned; ``None`` is treated as 3.

    Returns
    -------
    The result of ``NER.top_related`` (documented by the original author as
    a list of strings).
    """
    top_k = 3 if k is None else int(k)
    return NER.top_related(ent.lower(), k=top_k)

コード例 #14

0

ファイルを表示

ファイル: Locations.py プロジェクト: Jrhenderson11/NLP

def get_all_locations(text):
    """Collect candidate location strings from ``text``.

    Three sources are merged into one set:
    * "Capitalised words + digit-led token" matches, filtered by
      ``NER.filter_part_dates``;
    * multi-line candidates found by the large pattern below that contain a
      building word (``has_building_words``);
    * the same candidates that pass ``filter_basic``.

    Returns a set of stripped strings.
    """
    final = []

    # Raw string: "\w" in a normal string literal is an invalid escape
    # sequence (a warning on Python 3.6+).  The pattern itself is unchanged.
    basics = [match[0].strip()
              for match in re.findall(r"(([A-Z]\w+ )+[0-9]\w+)", text)]
    final.extend(NER.filter_part_dates(basics))

    firstlist = re.findall(
        r'(((([A-Z]\w+)|1st|2nd|3rd|4th|5th|6th|(r|R)o(o|m)m|[0-9]+[A-Z]*)((,? (((r|R)o(o|m)m|in) )?)(([A-Z]\w+|1st|2nd|3rd|4th|5th|6th|hall|Hall|[0-9]+[A-Z]*)))+)(\n\s?((([A-Z]\w+)|1st|2nd|3rd|4th|5th|6th|(r|R)o(o|m)m|[0-9]+[A-Z]*)((,? (((r|R)o(o|m)m|in) )?)(([A-Z]\w+|1st|2nd|3rd|4th|5th|6th|hall|Hall|[0-9]+[A-Z]*)))*))*)',
        text, re.MULTILINE)
    # group 0 of every match is the whole candidate
    firstcandidates = [match[0].strip() for match in firstlist]

    # strip one word line off
    candidates = []
    for candidate in firstcandidates:
        # findall returns the optional trailing group, so [''] means the
        # candidate ends with a newline followed by a single bare word
        if re.findall(r'\n\s?\w+(\s|\n)?$', candidate) == ['']:
            # NOTE(review): this substitution is not anchored with "$", so it
            # removes every "newline + word" occurrence, not only the
            # trailing one -- confirm that is intended.
            candidates.append(re.sub(r'\n\s?\w+(\s|\n)?', "", candidate))
        else:
            candidates.append(candidate)

    final.extend(match.strip() for match in has_building_words(candidates))
    final.extend(match.strip() for match in filter_basic(candidates))

    return set(final)

コード例 #15

0

ファイルを表示

 def process_block(self, text, terms, block_type, term_doc_count_dict,
                   total_length, silver_query):
     """Turn every term of a block into one training sample.

     For each term the features from ``self.process_word`` are computed;
     when ``self.useContext`` is set they are surrounded by the features of
     the previous and next term.  A missing neighbour (before the first and
     after the last term) is represented by ``size`` zeros.  The label
     passed to ``self.add_sample`` is 1 when the term occurs in the
     tokenised ``silver_query`` and 0 otherwise.
     """
     pos_tags = pos.get_pos_tags(terms)
     entity_words = set()
     if self.use_ner:
         entity_words = ner.get_entities(text)
     # assumes process_word returns `size` features -- TODO confirm
     size = 10
     for i, (term, pos_tag) in enumerate(zip(terms, pos_tags)):
         features = self.process_word(i, term, block_type, pos_tag,
                                      entity_words, term_doc_count_dict,
                                      total_length)
         if i > 0:
             prev_features = self.process_word(i - 1, terms[i - 1],
                                               block_type, pos_tags[i - 1],
                                               entity_words,
                                               term_doc_count_dict,
                                               total_length)
         else:
             prev_features = [0] * size
         if i < len(terms) - 1:
             next_features = self.process_word(i + 1, terms[i + 1],
                                               block_type, pos_tags[i + 1],
                                               entity_words,
                                               term_doc_count_dict,
                                               total_length)
         else:
             # Zero padding for the last term.  Previously the value from
             # the preceding iteration (the last term's own features) was
             # silently reused here.
             next_features = [0] * size
         if self.useContext:
             features = prev_features + features + next_features
         is_in_doc = int(
             term in anserini.tokenizeString(silver_query, 'lucene'))
         self.add_sample(is_in_doc, features)

コード例 #16

0

ファイルを表示

def clear_database():
    """Delegate to ``NER.clear_database``; its return value is discarded."""
    NER.clear_database()

コード例 #17

0

ファイルを表示

ファイル: qa.py プロジェクト: AnirudhNarasimhamurthy/Natural-Language-Processing-Fall-2015

    print 'NER List:', NER_list"""

    master_person_list = []
    master_org_list = []
    master_loc_list = []
    master_month_list = []
    master_time_list = []
    master_money_list = []
    master_percent_list = []
    master_prof_list = []

    for i in range(0, len(sentences_list)):
        sentences_list[i] = sentences_list[i].replace(",", "").replace(".", "").replace("!", "")
        # sentences_list[i]=sentences_list[i].replace("'",'"')
        sent_person_list, sent_org_list, sent_loc_list, sent_month_list, sent_time_list, sent_money_list, sent_percent_list, sent_prof_list = NER.named_entity_recognition(
            sentences_list[i]
        )
        master_person_list.append(sent_person_list)
        master_org_list.append(sent_org_list)
        master_loc_list.append(sent_loc_list)
        master_month_list.append(sent_month_list)  # month and weekday names + season names
        master_time_list.append(sent_time_list)
        master_money_list.append(sent_money_list)
        master_percent_list.append(sent_percent_list)
        master_prof_list.append(sent_prof_list)

    ################### CATEGORIZING THE QUESTION AS WH0, WHAT, WHEN , WHY , WHERE OR HOW  ########################

    who_list, what_list, when_list, why_list, where_list, how_list = [], [], [], [], [], []

    for i in range(0, len(cleansedqList)):

コード例 #18

0

ファイルを表示

ファイル: splitStatements.py プロジェクト: aboSamoor/lydia

def addNewlines(fin, fout):
    """Copy ``fin`` to ``fout``, adding an empty line after sentence-final lines.

    A line counts as sentence-final when it starts with ".", "?" or "!".
    Both paths are made absolute; reading and writing are delegated to
    ``NER.getText`` and ``NER.writeText``.
    """
    fin = os.path.abspath(fin)
    fout = os.path.abspath(fout)
    text = NER.getText(fin)
    # re.MULTILINE makes "^" match at the start of every line; without it
    # only the very first line of the file could ever match.  The character
    # class no longer contains "|", which inside [...] is a literal pipe,
    # not alternation.
    txt = re.sub(r"^([.?!].*)", '\\1\n', text, flags=re.MULTILINE)
    NER.writeText(fout, txt)

コード例 #19

0

ファイルを表示

def get_data():
    """Fetch the database with ``NER.retrieve_database``, store it on ``NER.db`` and return it."""
    database = NER.retrieve_database()
    NER.db = database
    return database

コード例 #20

0

ファイルを表示

ファイル: who.py プロジェクト: young1205/Natural-Language-Processing-Fall-2015

def answering_who(cleansedQuestion,stop_words_free_question,complete_sentence_list,sentence_list,sent_person_list,sent_prof_list):
    """Answer a "who" question: score every sentence, pick the best, extract an answer.

    Each sentence gets a score from a word-match rule plus bonuses for
    person/profession entities, the word "name", verbs shared with the
    question and profession words from the question.  The highest-scoring
    sentence is selected (the first one on a tie) and an answer string is
    built from its person names, profession names or noun phrases, falling
    back to the sentence itself.

    Parameters (all lists are indexed by the same sentence position):
        cleansedQuestion: the question text (a string; "?" is stripped here).
        stop_words_free_question: only referenced inside a disabled
            (string-literal) block below, so it is effectively unused.
        complete_sentence_list: the candidate sentences as strings.
        sentence_list: per-sentence text handed to ``WM.stemWordMatch``
            (presumably the stop-word-free sentences -- verify against caller).
        sent_person_list: per-sentence lists of person entities.
        sent_prof_list: per-sentence lists of profession entities.

    Returns:
        The answer as a string.

    NOTE(review): ``max(sent_score_list)`` raises ValueError when
    ``complete_sentence_list`` is empty.
    """

    # Declaring globals to be used in this function

    sent_score_list=[]
    q_verblist=[]


    stanford_stop_words_list=['a','an','and','are','as','at','be','buy','do','for','from',
                          'has','have','he','in','is','it','its','of','on','that','the',
                          'to','was','were','will','with']

    temp_q=cleansedQuestion
    #temp_q=temp_q.replace('"','')
    #temp_q=temp_q.replace("'",'"')
    temp_q=temp_q.replace('?','')

    lmtzr=WordNetLemmatizer()
    pos_list= POS_Tagging.pos_tagging(temp_q)

    # Collect the lemmas of the question's verbs, ignoring stop words
    for i in range(0, len(pos_list)):
        if pos_list[i][1] in ['VB','VBD','VBZ','VBN'] and lmtzr.lemmatize(pos_list[i][0],'v') not in stanford_stop_words_list:
            q_verblist.append(lmtzr.lemmatize(pos_list[i][0],'v'))



    #print 'Temp_q: ',temp_q

    q_person_list,q_org_list,q_loc_list,q_month_list,q_time_list,q_money_list,q_percent_list,q_prof_list = NER.named_entity_recognition(temp_q)

    for i in range(0, len(complete_sentence_list)):
        #print 'Sentence is :', complete_sentence_list[i]
        score=0

        # 1. Score using word match rule. Match words in question with the words in stop free sentence

        #print 'Sentence is :',sentence_list[i]
        score=score + WM.stemWordMatch(cleansedQuestion,sentence_list[i])

        # 2. If question does not contain name but the answer contains NAME then you are confident(+6)
        if q_person_list==[]:

            #Giving more weights to sentences having more names in it
            if sent_person_list[i] !=[] or sent_prof_list[i] !=[]:
                #score=score + 6*len(sent_person_list) + 6* len(sent_prof_list)
                score=score + 6

            # 3. If question does not contain a name and answer contains the word "name" then good_clue (+4)
            lmtzr = WordNetLemmatizer()
            temp= complete_sentence_list[i].split()
            for k in range(0,len(temp)):
                if lmtzr.lemmatize(temp[k].lower())=='name':
                    score=score + 4

            #  4. Awards points to all sentences  that contain a name or reference to a human
            # NOTE(review): same condition as rule 2, so such a sentence gets +10 in total

            if sent_person_list[i] !=[] or sent_prof_list[i] !=[]:
                #score=score + 4*len(sent_person_list) + 4* len(sent_prof_list)
                score=score+4


        # 5. If the answer contains the exact verb found in the question after the "Who" or in fact in the whole question
        # then it is a confident clue and we reward it more

        sent_pos_list= POS_Tagging.pos_tagging(complete_sentence_list[i])

        '''for m in range(0, len(sent_pos_list)):
            if sent_pos_list[m][1] in ['VB','VBD','VBN','VBG','VBZ'] and sent_pos_list[m][0] in stop_words_free_question.split():
                score=score + 18
                #print 'Score now is :', score'''

        for k in range(0, len(sent_pos_list)):
            if sent_pos_list[k][1] in ['VB','VBD','VBZ','VBN'] and lmtzr.lemmatize(sent_pos_list[k][0],'v') in q_verblist:
                #print 'Verb in question and sentence matches'
                score=score + 6



        # 6. If the question contains a profession name, the answer has to be a person and sentence would have
        #the person name and the profession

        if q_prof_list!=[]:
            for k in complete_sentence_list[i].split():
                if k.lower() in q_prof_list:
                    #print 'Profession Yes !'
                    score=score+18

        else:  #Question contains name so the chances of answer being a profession name are decent
            if sent_prof_list[i] !=[]:
                score=score+6


        sent_score_list.append(score)

    #print 'Sent score list is :',sent_score_list


    # Selecting the sentence that has the maximum score. If it is a tie, we choose the sentence that appears first


    candidate_list=[]
    npfinal_list=[]
    temp_list=[]
    answer_list=[]

    max_score_value=max(sent_score_list)

    #print 'Max score is :',max_score_value

    # Every sentence with the maximum score is a candidate, stored with its index
    for i in range(0, len(complete_sentence_list)):
        if sent_score_list[i]==max_score_value:
            candidate_list.append((complete_sentence_list[i],i))
    #print 'Candidate list is :',candidate_list


    #If there is only one sentence, then choose the sentence and then do the processing to display the answer

    if len(candidate_list)==1:

        temp_str= candidate_list[0][0]
        index=candidate_list[0][1]
        #Cleaning up the candidate sentence
        # Replacing double quotes with blank and single quotes with "
        #temp_str=temp_str.replace('"','')
        #temp_str=temp_str.replace("'",'"')
        #temp_str=temp_str.replace(',','').replace('?','').replace('!','')


    # If there are multiple candidates, then choose the sentence which appeared first in the story  and then do the processing
    else:
        # There are more than one candidate sentences. Print the first sentence
        # (the loop stops after its first iteration, so only candidate 0 is used)
        for k in range(0, len(candidate_list)):

            #Cleaning up the candidate sentence

            temp_str=candidate_list[k][0]
            index =candidate_list[k][1]
            #temp_str=temp_str.replace('"','')
            #temp_str=temp_str.replace("'",'"')
            #temp_str=temp_str.replace(',','').replace('?','').replace('!','')


            break

    ####################### SENTENCE PROCESSING TO FIND THE ANSWER ###############################

    #Just pick out the noun-phrase or PERSON names from the sentence

    #s_plist,s_orglist,s_loclist,s_monthlist,s_timelist,s_moneylist,s_percentlist,s_proflist=NER.named_entity_recognition(temp_str)
    s_plist=sent_person_list[index]
    s_proflist=sent_prof_list[index]

    #print 'Prof list is:',s_proflist

    #If the question has a name of person, then the answer sentence should/would most probably
    #the name of a person but it should not be the name of the person appearing in the question.
    #If we can't find any other name in the candidate sentence then we do POS tagging and display the NOUN phrases

    #print 'Question person list is:',q_person_list
    #print 'Sentence person list is:',s_plist

    result_list=[]
    q_loc_who_list=[]

    # NOTE: temp_q is a string, so every "x not in temp_q" below is a substring test

    if q_person_list==[] and s_plist==[]:   #If both the question does not have a name and the sentence does not have a name,print the whole sentence minus words which appear in question

        '''pos_np_list= POS_Tagging.pos_noun_tagging(temp_str)
        if pos_np_list != []:
            for x in pos_np_list:
                if x not in temp_q and x[0].isupper():   #Noun phrases or names generally start with an upper case character
                    print 'First character caps',x
                    result_list.append(x)
            return ' '.join(result_list)'''

        for k in temp_str.split():
            if k not in temp_q:
                result_list.append(k)

        return ' '.join(result_list)

    elif q_person_list !=[] and s_plist !=[]:    #To counter situations when both question and sentence has names Ex. Who defeated who ?
        for k in s_plist:
            if k not in temp_q:
                answer_list.append(k)


    elif q_person_list==[] and s_plist !=[]:
        for i in range(0, len(s_plist)):
            if s_plist[i] not in q_person_list and s_plist[i] not in temp_q:  #To counter situations where question has a name and NER doesn't identify it
                answer_list.append(s_plist[i])


    elif q_person_list != [] and s_proflist !=[]:  #To counter situations for 'Who is X' type questions which could have a profession name in the answer
        for k in s_proflist:
            answer_list.append(k)

    # NOTE(review): this branch looks unreachable -- every case with an empty
    # q_person_list is already taken by the first or third branch above
    elif q_person_list==[] and q_loc_list !=[]: # Who is <X> where ?
        #print 'Question has no name but has a location'
        for k in temp_str.split():
            if k not in temp_q:
                q_loc_who_list.append(k)
        if q_loc_who_list !=[]:
            return ' '.join(q_loc_who_list)

    '''elif q_person_list==[] and s_proflist !=[]:
        for k in s_proflist:
            answer_list.append(k)'''

    if answer_list != [] :#and flag==1:                #Indicating candidate sentence has a name other than that in question
        result= ' '.join(answer_list)
    else:

        #Pick out the noun phrase or nouns and then display them as answer

        np_list = POS_Tagging.pos_noun_tagging(temp_str)
        for x in np_list :
            if x not in temp_q:
                npfinal_list.append(x) #Removing all occurences of existing noun phrases from the question


        #print 'NP Final list after removal is',npfinal_list
        if npfinal_list !=[]:
            result=' '.join(npfinal_list)

        else:
            result=temp_str                  # Printing out the whole sentence

    #print 'Result is:',result
    return result

コード例 #21

0

ファイルを表示

ファイル: qa.py プロジェクト: SAISRIHARSHAS/CS5560_Sai_LabsOnly_Repository

    master_money_list = []
    master_percent_list = []
    master_prof_list = []

    #print 'Sentence list is:',sentences_list

    for i in range(0, len(sentences_list)):
        temp_str = sentences_list[i]
        '''sentences_list[i]=sentences_list[i].strip()
        if sentences_list[i].index(',') != -1:
            if sentences_list[i][sentences_list[i].index(',')+1]!=' ':
                sentences_list[i]=sentences_list[i].replace(',','').replace('!','')'''
        temp_str = temp_str.strip()
        temp_str = temp_str.replace(',', '').replace('!', '')
        #sentences_list[i]=sentences_list[i].replace("'",'"')
        sent_person_list, sent_org_list, sent_loc_list, sent_month_list, sent_time_list, sent_money_list, sent_percent_list, sent_prof_list = NER.named_entity_recognition(
            temp_str)
        master_person_list.append(sent_person_list)
        master_org_list.append(sent_org_list)
        master_loc_list.append(sent_loc_list)
        master_month_list.append(
            sent_month_list)  #month and weekday names + season names
        master_time_list.append(sent_time_list)
        master_money_list.append(sent_money_list)
        master_percent_list.append(sent_percent_list)
        master_prof_list.append(sent_prof_list)

    ################### CATEGORIZING THE QUESTION AS WH0, WHAT, WHEN , WHY , WHERE OR HOW  ########################

    who_list,what_list,when_list,why_list,where_list,how_list=[],[],[],[],[],[]

    for i in range(0, len(cleansedqList)):

コード例 #22

0

ファイルを表示

ファイル: bw2utf8.py プロジェクト: aboSamoor/lydia

#! /usr/bin/env python

import NER
import sys
import os

if __name__ == "__main__":
    # Usage: <script> INPUT OUTPUT -- both paths are made absolute before
    # being handed to NER.bwtoutf8.
    source_path = os.path.abspath(sys.argv[1])
    target_path = os.path.abspath(sys.argv[2])
    NER.bwtoutf8(source_path, target_path)

コード例 #23

0

ファイルを表示

ファイル: Code.py プロジェクト: Jrhenderson11/NLP

def part_one(fileList, myList):
	"""Tag every file in ``fileList``, evaluate the tags and print averaged scores.

	For each file the function tags paragraphs/sentences, times, speakers,
	locations and an ontology topic, evaluates each kind of tag against the
	hand-tagged version of the file, and writes the tagged text to the path
	at the same index in ``myList``.  Running totals of accuracy, precision,
	recall and F1 are kept per tag type and their averages are printed at
	the end.

	Parameters:
		fileList: paths of the untagged input files.
		myList: output paths, parallel to ``fileList``.

	NOTE(review): ``testList`` (hand-tagged paths) and ``stopping`` (pause
	after each file) are not defined in this function; they are presumably
	module-level names -- verify.
	"""
	#paragraph scores
	totPacc, totPprec, totPrec, totPf = 0, 0, 0, 0
	#sentence scores
	totSacc, totSprec, totSrec, totSf = 0, 0, 0, 0
	#time scores
	totTacc, totTprec, totTrec, totTf = 0, 0, 0, 0
	#name scores
	totNacc, totNprec, totNrec, totNf = 0, 0, 0, 0
	#location scores
	totLacc, totLprec, totLrec, totLf = 0, 0, 0, 0

	docs =[]
	classes = set()

	#because NER requires training a tagger it is best done as a
	#batch process rather than repeatedly calling a function,
	#therefore it is done here and the results processed later
	if (len(fileList) > 10):
		print("running NER on " + str(len(fileList)) + " files (might take a bit)")
	else:
		print("running NER")

	namesdict = NER.extract_names_files(fileList)
	numdict = {}

	#histogram: how many files have a given number of collapsed names
	for i in range (0, len(fileList)):
		untaggedfile = fileList[i]
		taggedfile = fileList[i].replace("un", "")

		text = get_text(untaggedfile)
		taggedtext = get_text(taggedfile)

		acnames = NER.collapse_names(NER.extract_tagged_names(taggedtext))
		nameset = NER.collapse_names(namesdict[fileList[i]])
		numnames = len(nameset)

		numdict[numnames] = numdict.get(numnames, 0) + 1
	print(numdict )

	for i in range (0, len(fileList)):

		untaggedfile = fileList[i]
		taggedfile = testList[i]
		mytaggedfile = myList[i]

		text = get_text(untaggedfile)
		taggedtext = get_text(taggedfile)
		mytext = ""
		body = (text.split('Abstract:')[1])

		fileName = untaggedfile

		#prepare for training by adding this emails class to set
		#0 = fname, 1 = class, 2 = email
		triple = create_train_triple(taggedfile)
		docs.append(triple)


		print("class: " + str(triple[1]) + "\n")
		classes.add(str(triple[1]))

		print(("\n\n		information extracted from " + fileName))
		print("Topic:")
		print(Ontology.get_topic(text))

		#test sentence tagging
		print("tagged sentences + paragraphs:")

		#untagged = Tagger.remove_tags(body)
		paratagged = Tagger.output_tagged_para(body)
		senttagged = Tagger.output_tagged_sents(paratagged)
		#print(senttagged)

		#print(body in text)
		mytext = text.split("Abstract:")[0] + "Abstract:" + senttagged

		#evaluate
		acparas =  Tagger.extract_paragraphs(taggedtext)
		myparas =  Tagger.extract_paragraphs(mytext)
		acsents =  Tagger.extract_sentences(taggedtext)
		mysents =  Tagger.extract_sentences(mytext)

		#calculate scores and tally total
		(acc, prec, rec, f) = Eval.evaluate_generic(fileName, myparas, acparas)
		totPacc, totPprec, totPrec, totPf = (totPacc + acc, totPprec + prec, totPrec + rec, totPf + f)

		(acc, prec, rec, f) = Eval.evaluate_generic(fileName, mysents, acsents)
		totSacc, totSprec, totSrec, totSf = (totSacc + acc, totSprec + prec, totSrec + rec, totSf + f)

		#Time tagging
		print("Times found:")
		(stimes, etimes) = Tagger.output_tagged_time(mytext)
		mytext = Tagger.find_and_tag(set(stimes), "stime", mytext)
		mytext = Tagger.find_and_tag(set(etimes), "etime", mytext)

		acstimes = Tagger.extract_stimes(taggedtext)
		acetimes = Tagger.extract_etimes(taggedtext)

		#eval times (two evaluations per file, hence the 2* in the averages)
		(acc, prec, rec, f) = Eval.evaluate_generic(fileName, stimes, acstimes)
		totTacc, totTprec, totTrec, totTf = (totTacc + acc, totTprec + prec, totTrec + rec, totTf + f)

		(acc, prec, rec, f) = Eval.evaluate_generic(fileName, etimes, acetimes)
		totTacc, totTprec, totTrec, totTf = (totTacc + acc, totTprec + prec, totTrec + rec, totTf + f)

		names = namesdict[fileName]
		print("people")
		print(names)

		print("ACSPEAKERS: ")
		acspeakers = NER.extract_tagged_names(taggedtext)
		print(acspeakers)

		speakers = []
		notspeakers = []
		if names != []:
			collapsednames = NER.collapse_names(names)
			#nameset = set(collapsednames)
			print(collapsednames)
			collapsednames = NER.filter_sender(collapsednames, text)

			if collapsednames != {}:
				speakerdict = NER.pick_speakers(collapsednames, text)
				for num in speakerdict:
					speakers.extend(collapsednames[num])

			print("SPEAKERS:")
			print(speakers)
			#concat not speakers into list for evaluation
			for name in names:
				if not name in speakers:
					notspeakers.append(name)

		(accuracy, precision, recall, f1) = Eval.evaluate_speakers(taggedfile, speakers, notspeakers, acspeakers)

		#tag
		mytext = Tagger.find_and_tag(speakers, "speaker", mytext)

		#eval names: tally the speaker evaluation just computed (previously
		#the stale acc/prec/rec/f of the end-time evaluation were added here)
		totNacc, totNprec, totNrec, totNf = (totNacc + accuracy, totNprec + precision, totNrec + recall, totNf + f1)

		#Location Tagging
		locations = Locations.get_all_locations(text)
		selectedlocs = Locations.pick_locations(locations, text)

		print("Selected locations:" )
		for loc in selectedlocs:
			print("	" + loc)
		mytext = Tagger.find_and_tag(selectedlocs, "location", mytext)

		print("ACLOCS:")
		aclocs = Locations.extract_tagged_locations(taggedtext)
		for acloc in aclocs:
			print("	" + acloc)
		#notlocs
		notlocs = []
		for loc in locations:
			if not loc in selectedlocs:
				notlocs.append(loc)
		(accuracy, precision, recall, f1) = Eval.evaluate_locations(taggedfile, selectedlocs, notlocs, aclocs)
		#a precision above 1 should be impossible: pause so it can be inspected
		if precision > 1:
			raw_input()
		totLacc = totLacc + accuracy
		totLprec = totLprec + precision
		totLrec = totLrec + recall
		totLf  =totLf + f1

		print("Topic:")
		mytext = Tagger.add_ontology_tag(mytext)


		print("final text:" )
		print(mytext)
		print("writing to " + mytaggedfile)
		#context manager closes the file even if the write fails
		with open(mytaggedfile, "w") as outfile:
			outfile.write(mytext)

		if stopping:
			if (i != (len(fileList) -1)):
				print("press enter for next email:")
			else:
				print("press enter to finish")
			x = raw_input()


	print("END:")

	print("Paragraph scores:")
	print("accuracy: " + str(totPacc  / (len(fileList))))
	print("precision: " + str(totPprec  / (len(fileList))) )
	print("recall: " + str(totPrec  / (len(fileList))))
	print("f1: " + str(totPf / (len(fileList))) + "\n")

	print("Sentence scores:")
	print("accuracy: " + str(totSacc  / (len(fileList))))
	print("precision: " + str(totSprec  / (len(fileList))) )
	print("recall: " + str(totSrec  / (len(fileList))))
	print("f1: " + str(totSf / (len(fileList))) + "\n")

	print("Time scores:")
	print("accuracy: " + str(totTacc  / (2*len(fileList))))
	print("precision: " + str(totTprec  / (2*len(fileList))) )
	print("recall: " + str(totTrec  / (2*len(fileList))))
	print("f1: " + str(totTf / (2*len(fileList))) + "\n")

	print("NER scores:")
	print("accuracy: " + str(totNacc  / len(fileList)))
	print("precision: " + str(totNprec  / len(fileList)) )
	print("recall: " + str(totNrec  / len(fileList)))
	print("f1: " + str(totNf / len(fileList)) + "\n")

	print("Location scores:")
	print("accuracy: " + str(totLacc  / len(fileList)))
	print("precision: " + str(totLprec  / len(fileList)) )
	print("recall: " + str(totLrec  / len(fileList)))
	print("f1: " + str(totLf / len(fileList))  + "\n")

	#six evaluations per file: paragraphs, sentences, start times, end times,
	#names and locations
	print("=========================================")
	print("	Overall:")
	print("accuracy: " + str((totLacc+totPacc+totSacc+totTacc+totNacc) / (6*len(fileList))))
	print("precision: " + str((totLprec+totPprec+totSprec+totTprec+totNprec) / (6*len(fileList))) )
	print("recall: " + str((totLrec+totPrec+totSrec+totTrec+totNrec) / (6*len(fileList))))
	print("f1: " + str((totLf+totPf+totSf+totTf+totNf) / (6*len(fileList)))  + "\n")

コード例 #24

0

ファイルを表示

    master_person_list=[]
    master_org_list=[]
    master_loc_list=[]
    master_month_list=[]
    master_time_list=[]
    master_money_list=[]
    master_percent_list=[]
    master_prof_list=[]



    for i in range(0, len(sentences_list)):
        sentences_list[i]=sentences_list[i].replace(',','').replace('.','').replace('!','')
        #sentences_list[i]=sentences_list[i].replace("'",'"')
        sent_person_list,sent_org_list,sent_loc_list,sent_month_list,sent_time_list,sent_money_list,sent_percent_list,sent_prof_list=NER.named_entity_recognition(sentences_list[i])
        master_person_list.append(sent_person_list)
        master_org_list.append(sent_org_list)
        master_loc_list.append(sent_loc_list)
        master_month_list.append(sent_month_list)  #month and weekday names + season names
        master_time_list.append(sent_time_list)
        master_money_list.append(sent_money_list)
        master_percent_list.append(sent_percent_list)
        master_prof_list.append(sent_prof_list)


    ################### CATEGORIZING THE QUESTION AS WH0, WHAT, WHEN , WHY , WHERE OR HOW  ########################

    who_list,what_list,when_list,why_list,where_list,how_list=[],[],[],[],[],[]

コード例 #25

0

ファイルを表示

ファイル: Code.py プロジェクト: Jrhenderson11/NLP

	#print ("Files to be processed are: ") 
	#print (fileList)

	print ("----------------------------------------")
	docs =[]
	classes = set()

	#because NER requires training a tagger it is best done as a 
	#batch process rather than repeatedly calling a function, 
	#therefore it is done here and the results processed later
	if (len(fileList) > 10):
		print "running NER on " + str(len(fileList)) + " files (might take a bit)"
	else:
		print "running NER"

	namesdict = NER.extract_names_files(fileList)
	numdict = {}


	for i in range (0, len(fileList)):
		untaggedfile = fileList[i]
		taggedfile = fileList[i].replace("un", "")

		text = get_text(untaggedfile)
		taggedtext = get_text(taggedfile)

		acnames = NER.collapse_names(NER.extract_tagged_names(taggedtext))
		nameset = NER.collapse_names(namesdict[fileList[i]])
		numnames = len(nameset)

		if numnames in numdict:

コード例 #26

0

ファイルを表示

ファイル: test_months.py プロジェクト: AnirudhNarasimhamurthy/Natural-Language-Processing-Fall-2015

__author__ = 'Anirudh'
import NER

# Sample sentences that mention month names (one also mentions a money
# amount); each is passed to the recogniser in turn and the result is ignored.
sentence_list = [
    'This is January',
    'She flew in December',
    'In March he got a $50 ticket -- and decided to take it to court',
]
for sentence in sentence_list:
    NER.named_entity_recognition(sentence)

コード例 #27

0

ファイルを表示

ファイル: splitStatements.py プロジェクト: aboSamoor/lydia

def fixCols(fin, fout):
    """Copy the text of *fin* to *fout*, patching lines that start with ``O``.

    Every newline that is immediately followed by ``O`` is replaced by a
    newline, a ``.``, a tab and then ``O``.  Both paths are made absolute
    first; reading and writing are delegated to ``NER.getText`` and
    ``NER.writeText``.
    """
    source_path = os.path.abspath(fin)
    target_path = os.path.abspath(fout)
    patched = re.sub(r"(\nO)", '\n.\tO', NER.getText(source_path))
    NER.writeText(target_path, patched)

コード例 #28

0

ファイルを表示

ファイル: utf82bw.py プロジェクト: aboSamoor/lydia

#! /usr/bin/env python

import NER
import sys
import os

if __name__ == "__main__":
    # Usage: <script> INPUT OUTPUT -- both paths are made absolute and then
    # handed to NER.utf8tobw.
    in_path = os.path.abspath(sys.argv[1])
    out_path = os.path.abspath(sys.argv[2])
    NER.utf8tobw(in_path, out_path)

コード例 #29

0

ファイルを表示

	#print ("Files to be processed are: ") 
	#print (fileList)

	print ("----------------------------------------")
	docs =[]
	classes = set()

	#because NER requires training a tagger it is best done as a 
	#batch process rather than repeatedly calling a function, 
	#therefore it is done here and the results processed later
	if (len(fileList) > 10):
		print "running NER on " + str(len(fileList)) + " files (might take a bit)"
	else:
		print "running NER"

	namesdict = NER.extract_names_files(fileList)
	numdict = {}


	for i in range (0, len(fileList)):
		untaggedfile = fileList[i]

		text = get_text(untaggedfile)


		nameset = NER.collapse_names(namesdict[fileList[i]])
		
		numnames = len(nameset)

		if numnames in numdict:
			numdict[numnames] = numdict[numnames] + 1

コード例 #30

0

ファイルを表示

ファイル: qa.py プロジェクト: AnirudhNarasimhamurthy/Natural-Language-Processing-Fall-2015

    master_percent_list=[]
    master_prof_list=[]


    #print 'Sentence list is:',sentences_list

    for i in range(0, len(sentences_list)):
        temp_str=sentences_list[i]
        '''sentences_list[i]=sentences_list[i].strip()
        if sentences_list[i].index(',') != -1:
            if sentences_list[i][sentences_list[i].index(',')+1]!=' ':
                sentences_list[i]=sentences_list[i].replace(',','').replace('!','')'''
        temp_str=temp_str.strip()
        temp_str=temp_str.replace(',','').replace('!','')
        #sentences_list[i]=sentences_list[i].replace("'",'"')
        sent_person_list,sent_org_list,sent_loc_list,sent_month_list,sent_time_list,sent_money_list,sent_percent_list,sent_prof_list=NER.named_entity_recognition(temp_str)
        master_person_list.append(sent_person_list)
        master_org_list.append(sent_org_list)
        master_loc_list.append(sent_loc_list)
        master_month_list.append(sent_month_list)  #month and weekday names + season names
        master_time_list.append(sent_time_list)
        master_money_list.append(sent_money_list)
        master_percent_list.append(sent_percent_list)
        master_prof_list.append(sent_prof_list)


    ################### CATEGORIZING THE QUESTION AS WHO, WHAT, WHEN, WHY, WHERE OR HOW  ########################

    who_list,what_list,when_list,why_list,where_list,how_list=[],[],[],[],[],[]

コード例 #31

0

ファイルを表示

def collect_articles():
    """
    Collects articles from Reuters and writes to database.

    The articles returned by ``news_loader.for_ner()`` are passed to
    ``NER.update`` and the result is persisted with ``NER.write_database()``.
    """
    articles = news_loader.for_ner()
    NER.update(articles)
    NER.write_database()

コード例 #32

0

ファイルを表示

ファイル: validate.py プロジェクト: gsantia/context-sensitive-NER

def trainFinal(multi=True,
               plus2016=False,
               pluslexica=False,
               pluswiki=False,
               plusgmb=False,
               plusdev=False,
               plusconll2003=False,
               doInternal=True,
               doExternal=True,
               lowercase=False,
               externalPOS=False,
               outhandle=""):
    """Train on the selected corpora and pickle the resulting models.

    The 2017 training file is always loaded; every ``plus*`` flag appends one
    more corpus to the list of training records (blank-line separated CoNLL
    records of tab-separated ``token<TAB>tag`` lines).

    Parameters
    ----------
    multi : passed through to ``NER.trainExternal``.
    plus2016, pluswiki, plusgmb, plusdev, plusconll2003 : bool
        Add the corresponding corpus to the training records.
    pluslexica : bool
        Add single-token records built from the lexicon files, together
        with per-record weights, to the internal training data.
    doInternal, doExternal : bool
        Which of the two models to train and write to disk.
    lowercase : bool
        Lowercase tokens when counting for the lexica and pass the flag on
        to ``NER.trainInternal``.
    externalPOS : bool
        Train the external model on POS-tag records (``POS<TAB>tag``)
        instead of token records.
    outhandle : str
        Optional extra label inserted into the output file names.

    Returns
    -------
    None.  The models are written to ``finalexdef*_entitytype.pickle`` and
    ``finalindef*_entitytype.pickle``; every active option adds its own label
    to the file name.
    """

    POS_train = []

    # load the 2017 data
    training_2017 = '../data/2017/wnut17train.conll'
    with open(training_2017, 'r') as f2:
        train = re.split("\n[\t]?\n", f2.read().strip())
        if externalPOS:
            for record in train:
                if record:  #avoid empty strings
                    data = [
                        re.split('\t', d) for d in re.split("\n", record)
                        if len(re.split("\t", d)) == 2
                    ]
                    tokens, tags = zip(*data)
                    # replace each token by the POS tag the tagger assigns it
                    POSs = [t[1] for t in tagger.tag(tokens)]
                    POS_train.append("\n".join([
                        "\t".join([POS, tag]) for POS, tag in zip(POSs, tags)
                    ]))

    if plusconll2003:
        conll = "conll2003_toks.conll"
        with open(conll, 'r') as f:
            data_conll = re.split("\n[\t]?\n", f.read().strip())
        train.extend(data_conll)
        if externalPOS:
            conll = "conll2003_POSs.conll"
            with open(conll, 'r') as f:
                data_conll = re.split("\n[\t]?\n", f.read().strip())
            POS_train.extend(data_conll)

    if plusdev:
        dev = "../data/emerging.dev.conll"
        with open(dev, 'r') as f:
            data_dev = re.split("\n[\t]?\n", f.read().strip())
        train.extend(data_dev)
        if externalPOS:
            dev = "dev_POSs.conll"
            with open(dev, 'r') as f:
                data_dev = re.split("\n[\t]?\n", f.read().strip())
            POS_train.extend(data_dev)

    if pluswiki:
        wiki = "wiki_toks_amazing.conll"
        with open(wiki, 'r') as f:
            data_wiki = re.split("\n[\t]?\n", f.read().strip())
        train.extend(data_wiki)
        if externalPOS:
            wiki = "wiki_POSs.conll"
            with open(wiki, 'r') as f:
                data_wiki = re.split("\n[\t]?\n", f.read().strip())
            POS_train.extend(data_wiki)

    if plusgmb:
        gmb = "gmb_toks.conll"
        with open(gmb, 'r') as f:
            data_gmb = re.split("\n[\t]?\n", f.read().strip())
        # FIX: this used to extend with data_wiki, which either raised a
        # NameError (pluswiki off) or added the wiki data a second time
        # while never adding the GMB records.
        train.extend(data_gmb)
        if externalPOS:
            gmb = "gmb_POSs.conll"
            with open(gmb, 'r') as f:
                data_gmb = re.split("\n[\t]?\n", f.read().strip())
            POS_train.extend(data_gmb)

    # load the 2016 data
    if plus2016:
        training_2016 = '../data/2016/data/train'
        dev_2016 = '../data/2016/data/dev'
        test_2016 = '../data/2016/data/test'

        with open(training_2016, 'r') as f:
            data_2016 = re.split("\n[\t]?\n", f.read().strip())
        with open(dev_2016, 'r') as f2:
            data_2016_dev = re.split("\n[\t]?\n", f2.read().strip())
        with open(test_2016, 'r') as f3:
            data_2016_test = re.split("\n[\t]?\n", f3.read().strip())
        train2 = data_2016 + data_2016_dev + data_2016_test

        train.extend(train2)

        if externalPOS:
            for record in train2:
                if record:  #avoid empty strings
                    data = [
                        re.split('\t', d) for d in re.split("\n", record)
                        if len(re.split("\t", d)) == 2
                    ]
                    tokens, tags = zip(*data)
                    POSs = [t[1] for t in tagger.tag(tokens)]
                    POS_train.append("\n".join([
                        "\t".join([POS, tag]) for POS, tag in zip(POSs, tags)
                    ]))

    # load and weight the lexical data
    if pluslexica:
        numtoks = 0
        weights = []
        cts = Counter()  # token -> count of "O" occurrences
        NEcts = defaultdict(Counter)  # tag -> token -> count
        numNEs = Counter()  # "B-..." tag -> number of entities started
        totalNEs = 0
        for record in train:
            if record:  #avoid empty strings
                data = [
                    re.split('\t', d) for d in re.split("\n", record)
                    if len(re.split("\t", d)) == 2
                ]
                tokens, tags = zip(*data)
                if lowercase:
                    tokens = [token.lower() for token in tokens]
                for i, token in enumerate(tokens):
                    if tags[i] == "O":
                        cts[token] += 1
                    else:
                        NEcts[tags[i]][token] += 1
                        if tags[i][0] == "B":
                            totalNEs += 1
                            numNEs[tags[i]] += 1

            # NOTE(review): appended for every record, including empty ones
            # (re-using the previous record's token count) -- presumably to
            # keep `weights` index-aligned with `train`; confirm.
            weights.append([1] * len(tokens))

        # lexicon file name -> entity type it contributes
        lexica = {
            ## "architecture.museum": "location",
            ## "automotive.make": "corporation",
            "automotive.model": "product",
            ## "broadcast.tv_channel": "corporation",
            ## "business.consumer_company": "corporation",
            "business.consumer_product": "product",
            ## "cvg.computer_videogame": "creative-work",
            ## "cvg.cvg_developer": "corporation",
            "cvg.cvg_platform": "product",
            "firstname.5k": "person",  # note to Capcase this lexicon
            "lastname.5000": "person",  # note to Capcase with B- and I-tags
            ## "location": "location",
            "location.country": "location",
            "people.family_name":
            "person",  # note to Capcase with B- and I-tags
            "people.person.filtered": "person",
            ## "sports.sports_team": "group",
            ## "tv.tv_network": "corporation",
            ## "tv.tv_program": "creative-work",
        }

        lexNEs = Counter()  # "B-..." tag -> number of lexicon entries
        lexrec = []
        lexweights = []
        lexterms = set()
        for lexicon in lexica:
            NEtype = lexica[lexicon]
            with open("/data/WNUT-NER-2017/data/lexicon/" + lexicon) as f:
                for line in f:
                    if lowercase:
                        line = line.strip().lower()
                    else:
                        line = line.strip()
                    # split on spaces, dots and commas, keeping the
                    # punctuation as tokens but dropping the spaces
                    tokens = [
                        s for s in re.split(r"([ \.\,])", line)
                        if s != " " and s
                    ]
                    if tokens:
                        lexNEs["B-" + NEtype] += 1
                        if not lowercase:
                            if (lexicon == "firstname.5k"
                                    or lexicon == "lastname.5000"
                                    or lexicon == "people.family_name"):
                                tokens = [
                                    s.capitalize()
                                    for s in re.split(r"([ \.\,])", line)
                                    if s != " " and s
                                ]

                        tags = ["I-" + NEtype for t in tokens]
                        tags[0] = "B-" + NEtype
                        for token, tag in zip(tokens, tags):
                            weight = 1
                            lexterms.add((token, tag, weight))
                        # family names are also registered as entity-internal
                        if (lexicon == "lastname.5000"
                                or lexicon == "people.family_name"):
                            weight = 1
                            lexterms.add((tokens[0], "I-" + NEtype, weight))

        for lexterm in lexterms:
            NEtype = "B" + lexterm[1][1:]
            # NOTE(review): relies on true division; under Python 2 this
            # needs `from __future__ import division` at file level -- confirm.
            lexweights.append([numNEs[NEtype] / lexNEs[NEtype]])
            record = "\t".join([lexterm[0], lexterm[1]])
            lexrec.append(record)

        for token in cts.most_common():
            lexweights.append([cts[token[0]]])
            record = "\t".join([token[0], "O"])
            lexrec.append(record)

        # entity types without any lexicon fall back to the training counts
        for NEtype in NEcts:
            if not lexNEs["B-" + NEtype[2:]]:
                for token in NEcts[NEtype].most_common():
                    lexweights.append([NEcts[NEtype][token[0]]])
                    record = "\t".join([token[0], NEtype])
                    lexrec.append(record)
    """
    #now load in the already trained extra data
    with open('extracon.txt', 'rb') as infile:
        con_counts = pickle.load(infile)
    with open('extradef.txt', 'rb') as f:
        deflike = pickle.load(f)
    """
    exdeflike = {}
    if doExternal:
        if externalPOS:
            exdeflike = NER.trainExternal(POS_train, multi)
        else:
            exdeflike = NER.trainExternal(train, multi)

    indeflike = {}
    if doInternal:
        if pluslexica:
            train.extend(lexrec)
            weights.extend(lexweights)
            indeflike = NER.trainInternal(train,
                                          weights=weights,
                                          lowercase=lowercase)
        else:
            indeflike = NER.trainInternal(train, lowercase=lowercase)

    #now write these dicts to disk
    exout = 'finalexdef_entitytype.pickle'
    inout = 'finalindef_entitytype.pickle'

    # each active option inserts its label just before "_entitytype"
    if externalPOS:
        exout = re.sub("_entitytype", "_externalPOS_entitytype", exout)

    if outhandle:
        exout = re.sub("_entitytype", "_" + outhandle + "_entitytype", exout)
        inout = re.sub("_entitytype", "_" + outhandle + "_entitytype", inout)

    if plusdev:
        exout = re.sub("_entitytype", "_plusdev_entitytype", exout)
        inout = re.sub("_entitytype", "_plusdev_entitytype", inout)

    if plusconll2003:
        exout = re.sub("_entitytype", "_plusconll2003_entitytype", exout)
        inout = re.sub("_entitytype", "_plusconll2003_entitytype", inout)

    if plus2016:
        exout = re.sub("_entitytype", "_plus2016_entitytype", exout)
        inout = re.sub("_entitytype", "_plus2016_entitytype", inout)

    if pluslexica:
        exout = re.sub("_entitytype", "_pluslexica_entitytype", exout)
        inout = re.sub("_entitytype", "_pluslexica_entitytype", inout)

    if pluswiki:
        exout = re.sub("_entitytype", "_pluswiki_entitytype", exout)
        inout = re.sub("_entitytype", "_pluswiki_entitytype", inout)

    if plusgmb:
        exout = re.sub("_entitytype", "_plusgmb_entitytype", exout)
        inout = re.sub("_entitytype", "_plusgmb_entitytype", inout)

    if lowercase:
        exout = re.sub("_entitytype", "_lowercase_entitytype", exout)
        inout = re.sub("_entitytype", "_lowercase_entitytype", inout)

    if doExternal:
        with open(exout, 'wb') as outfile:
            pickle.dump(exdeflike, outfile)

    if doInternal:
        with open(inout, 'wb') as outfile2:
            pickle.dump(indeflike, outfile2)

コード例 #33

0

ファイルを表示

ファイル: validate.py プロジェクト: gsantia/context-sensitive-NER

def crossTrain(n=10, multi=True):
    """Pre-compute the per-fold models for n-fold cross validation.

    The 2017 training file is split into *n* partitions with ``split_data``.
    For every partition ``i`` the external and internal models are trained
    on all the *other* partitions and stored under key ``i``.  Both
    collections are then pickled to ``exdeflike.txt`` and ``indeflike.txt``
    so later runs can load them instead of retraining.

    *multi* is passed through to ``NER.trainExternal``.
    """
    data_split = split_data('../data/2017/wnut17train.conll', n)

    total_exdef = defaultdict(defaultdict)
    total_indef = defaultdict(defaultdict)

    # (Mixing the 2016 train/dev/test data into every fold is currently
    # disabled, as is loading the pre-trained "extra" counts.)
    for fold_no, _held_out in enumerate(data_split):
        # every partition except the current one, flattened into one list
        folds = list(data_split)
        training_folds = folds[:fold_no] + folds[fold_no + 1:]
        train = []
        for fold in training_folds:
            train.extend(fold)

        total_exdef[fold_no] = NER.trainExternal(train, multi)
        total_indef[fold_no] = NER.trainInternal(train)

    # write these dicts to the disk
    for path, payload in (('exdeflike.txt', total_exdef),
                          ('indeflike.txt', total_indef)):
        with open(path, 'wb') as sink:
            pickle.dump(payload, sink)

コード例 #34

0

ファイルを表示

ファイル: validate.py プロジェクト: gsantia/context-sensitive-NER

def final_analysis(
    exdeflikefile,
    indeflikefile,
    multi=True,
    lowercase=True,
    externalPOS=True,
    dev=True,
    thresholds={
        "location": 0.292,
        "group": 0.09,
        "product": 0.131,
        "creative-work": 1.1,
        "person": 0.202,
        "corporation": 1.1
    }):

    #load exdef and indef
    with open(exdeflikefile) as f:
        exdeflike = pickle.load(f)

    with open(indeflikefile) as f:
        indeflike = pickle.load(f)
    """train all of the given training data and then test it on the supplied
    test records. make predictions for NE for each token, then print them
    out in the format required"""
    """
    #load exdef and indef
    with open('finalexdef.pickle', 'rb') as infile:
        exdeflike = pickle.load(infile)

    with open('finalindef.pickle', 'rb') as infile2:
        indeflike = pickle.load(infile2)
    """
    #load the test data
    if dev:
        test_file = '../data/emerging.dev.conll'
        outfilename = "../data/finalpredictions/emerging_" + "_".join(
            re.split("_", indeflikefile)[1:3]) + ".dev"
    else:
        test_file = '../data/emerging.test'
        outfilename = "../data/finalpredictions/emerging_" + "_".join(
            re.split("_", indeflikefile)[1:3]) + ".test"

    with open(test_file, 'r') as f3:
        records = re.split("\n[\t]?\n", f3.read().strip())

    #analyze the test data
    #on the training data, using n-fold validation
    # f = open(test_file + ".prediction", 'w')
    f = open(outfilename, "w")

    # thresholds = {
    #     ## "location": [0.001, 0.157],
    #     "location": [1.1, 0.157],
    #     ## "group": [0.008, 0.199],
    #     "group": [1.1, 0.199],
    #     "product": [1.1, 0.215],
    #     "creative-work": [1.1,0.499],
    #     ## "person": [0.002, 0.167],
    #     "person": [1.1, 0.167],
    #     "corporation": [1.1, 0.218]
    # }

    for record in records:
        if record:  #avoid empty strings
            if dev:
                data = [
                    re.split('\t', d) for d in re.split("\n", record)
                    if len(re.split("\t", d)) == 2
                ]
                tokens, tags = zip(*data)
            else:
                tokens = [
                    re.split('\t', d)[0] for d in re.split("\n", record)
                    if len(re.split("\t", d)) == 1
                ]
            uppertokens = list(tokens)
            if lowercase:
                tokens = [token.lower() for token in tokens]
            #keep track of the NE assignments for each token with tuples
            if lowercase:
                assignments = [[token, 'O'] for token in uppertokens if token]
            else:
                assignments = [[token, 'O'] for token in tokens if token]
            ##

            predictions = {}

            for NEtype in thresholds:
                exavedeflike, inavedeflike = NER.test(tokens,
                                                      exdeflike,
                                                      indeflike,
                                                      multi,
                                                      NEtype,
                                                      externalPOS=externalPOS,
                                                      uppertokens=uppertokens)

                #find the NEs using the _LFD_ function as before
                for indices in NER.LFD(tokens, exavedeflike, inavedeflike,
                                       [1.1, thresholds[NEtype]]):
                    # if exavedeflike[indices] >= thresholds[NEtype][0]:
                    #     print NEtype+": ", [tokens[ix] for ix in indices]
                    #     print "external", exavedeflike[indices]
                    innums = [
                        inavedeflike[ix][1] for ix in indices
                        if ix != indices[0]
                    ]
                    innums.append(inavedeflike[indices[0]][0])
                    ## print "internal", NER.harmonic_mean(innums)

                    predictions[(indices, NEtype)] = [
                        len(list(indices)), exavedeflike[indices],
                        NER.harmonic_mean(innums)
                    ]
            ##
            for indices, NEtype in predictions:
                thissize = predictions[(indices, NEtype)][0]
                thislike = predictions[(indices, NEtype)][2]
                for otherindices, otherNEtype in predictions:
                    thatsize = predictions[(otherindices, otherNEtype)][0]
                    thatlike = predictions[(otherindices, otherNEtype)][2]
                    broken = True
                    for ix in otherindices:
                        if ix in indices:
                            if otherindices[0] < indices[0]:
                                print("precidence, avoided: ",
                                      [tokens[ix] for ix in indices], " over ",
                                      [tokens[ix] for ix in otherindices])
                                break
                            elif otherindices[0] == indices[0]:
                                if thatsize > thissize:
                                    print("size, avoided: ",
                                          [tokens[ix]
                                           for ix in indices], " over ",
                                          [tokens[ix] for ix in otherindices])
                                    break
                                elif thatlike > thislike:
                                    print("likelihood, avoided: " + NEtype,
                                          [tokens[ix] for ix in indices
                                           ], " over " + otherNEtype,
                                          [tokens[ix] for ix in otherindices])
                                    break
                    else:
                        broken = False
                    if broken:
                        break
                else:

                    print NEtype + ": ", [tokens[ix] for ix in indices]
                    print "internal", thislike
                    ##
                    #assign 'B' to the first, 'I' to the rest
                    n = 0
                    for index in indices:
                        if n == 0:
                            assignments[index][1] = 'B-' + NEtype
                        else:
                            assignments[index][1] = 'I-' + NEtype
                        n += 1  #keep track of position in NE

            ##

            for i, assignment in enumerate(assignments):
                if dev:
                    f.writelines(
                        "\t".join([assignment[0], tags[i], assignment[1]]) +
                        "\n")
                else:
                    f.writelines("\t".join([assignment[0], assignment[1]]) +
                                 "\n")

            f.writelines("\n")

コード例 #35

0

ファイルを表示

import NER, pickle, nltk
import itertools

corpus_root = "/Users/funktor/Downloads/gmb-2.2.0"

# Wrap the corpus in a single iterator so that the two slices below are taken
# one after the other and therefore never overlap.
sentences = iter(NER.read_gmb(corpus_root))

# FIX: islice() objects are one-shot iterators, yet both slices are consumed
# twice below (CRF and online model).  Materialise them once and hand every
# consumer a fresh iterator over the stored sentences.
train_sents = list(itertools.islice(sentences, 50000))
test_sents = list(itertools.islice(sentences, 5000))

crf_model = NER.trainCRF(iter(train_sents))

# Round-trip the model through disk; `with` makes sure the handles are closed.
with open('crf_model.sav', 'wb') as model_file:
    pickle.dump(crf_model, model_file)

with open('crf_model.sav', 'rb') as model_file:
    crf_model = pickle.load(model_file)

# (renamed from `str`, which shadowed the builtin)
sample_sentence = "Christian Bale acted as the Batman and Heath Ledger as the Joker in the movie The Dark Knight"
print(NER.predictNERSentence(sample_sentence, crf_model))

print(NER.testCRF(crf_model, iter(test_sents)))

# label set handed to the online learner
tags = [
    'O', 'B-per', 'I-per', 'B-gpe', 'I-gpe', 'B-geo', 'I-geo', 'B-org',
    'I-org', 'B-tim', 'I-tim', 'B-art', 'I-art', 'B-eve', 'I-eve', 'B-nat',
    'I-nat'
]

clf = NER.trainOnline(iter(train_sents), tags, batch_size=500)

NER.testOnline(clf, iter(test_sents))

コード例 #36

0

ファイルを表示

ファイル: validate.py プロジェクト: gsantia/context-sensitive-NER

def final_scan(exdeflike,
               indeflike,
               multi=True,
               lowercase=False,
               externalPOS=False,
               outkey="default"):
    """Search, per entity type, for the threshold that scores best on the
    dev records.

    Starting from 0.5 for every entity type, three rounds are run.  In each
    round a grid of nine thresholds around the current value is tried; for
    every (threshold, entity type) pair a prediction file is written under
    ``../data/predictions/<outkey>/`` and scored by running
    ``../data/wnuteval.py`` in a subprocess.  The best-scoring threshold of
    a round becomes the centre of the next, finer round.

    exdeflike / indeflike are the already trained external and internal
    models; multi and externalPOS are passed through to ``NER.test``.  With
    lowercase set, prediction uses lowercased tokens while the original
    casing is written out.

    Returns ``(tstarts, allresults)``: the final threshold per entity type
    and every ``(scores, threshold)`` pair evaluated.  Both are also dumped
    to ``allresults.json`` in the output directory.
    """
    """
    #load exdef and indef
    with open('finalexdef.pickle', 'rb') as infile:
        exdeflike = pickle.load(infile)

    with open('finalindef.pickle', 'rb') as infile2:
        indeflike = pickle.load(infile2)
    """
    #load the test data
    test_file = '../data/emerging.dev.conll'
    with open(test_file, 'r') as f3:
        records = re.split("\n[\t]?\n", f3.read().strip())

    numrecs = len(records)

    # NOTE(review): outkey is interpolated into a shell command unescaped;
    # only pass trusted values.
    os.system("mkdir -p ../data/predictions/" + outkey)

    #analyze the test data
    # threshold = [0.138, 0.13]
    #this is the threshold we found to give the best F1 score
    #on the training data, using n-fold validation

    # current centre of the search grid for every entity type
    tstarts = {
        "location": 0.5,
        "group": 0.5,
        "product": 0.5,
        "creative-work": 0.5,
        "person": 0.5,
        "corporation": 0.5
    }

    # offsets around the centre; divided by 10 at the start of every round
    tdiffs = range(-4,
                   5)  # [d for d in range(-49,50)] ## diffs = [-0.49--0.49]
    allresults = defaultdict(list)

    ## begin rounds loop here
    for rnd in range(1, 4):  ##range(1,2): ##
        print "working on rnd " + str(rnd)
        print "here are the starting thresholds: "
        for NEtype in tstarts:
            print NEtype, tstarts[NEtype]
        print
        results = defaultdict(list)
        # shrink the grid step by a factor of ten each round
        # NOTE(review): assumes true division (from __future__ import
        # division) -- with Python 2 integer division the offsets collapse;
        # confirm at file level.
        tdiffs = [tdiff / 10 for tdiff in tdiffs]
        # [tdiff/100 for tdiff in tdiffs] [-0.0049--0.0049]

        # candidate thresholds of this round, per entity type
        NEthreshs = {
            NEtype: [tstarts[NEtype] + tdiff for tdiff in tdiffs]
            # [t/1000 for t in range(1001)]
            for NEtype in tstarts
        }
        # "<threshold>-<NEtype>" -> [file handle, file name]
        fs = {}
        for NEtype in NEthreshs:
            for t in NEthreshs[NEtype]:
                fkey = str(t) + "-" + NEtype
                threshfile = re.sub("/data/",
                                    "/data/predictions/" + outkey + "/",
                                    test_file + "-" + fkey + ".prediction")
                # opening in 'w' mode and closing right away truncates the
                # file; records are appended to it later on
                fs[fkey] = [open(threshfile, 'w'), threshfile]
                fs[fkey][0].close()
        numdone = 0
        for record in records:
            print str(
                100 * numdone / numrecs) + "% done with round " + str(rnd)
            numdone += 1
            if record:  #avoid empty strings
                data = [
                    re.split('\t', d) for d in re.split("\n", record)
                    if len(re.split("\t", d)) == 2
                ]
                tokens, tags = zip(*data)
                uppertokens = list(tokens)
                if lowercase:
                    tokens = [token.lower() for token in tokens]

                for NEtype in NEthreshs:
                    # scores are computed once per entity type and re-used
                    # for every candidate threshold
                    exavedeflike, inavedeflike = NER.test(
                        tokens,
                        exdeflike,
                        indeflike,
                        multi,
                        NEtype,
                        externalPOS=externalPOS,
                        uppertokens=uppertokens)

                    for t in NEthreshs[NEtype]:
                        #keep track of the NE assignments
                        #for each token with tuples
                        if lowercase:
                            assignments = [[token, 'O']
                                           for token in uppertokens if token]
                        else:
                            assignments = [[token, 'O'] for token in tokens
                                           if token]
                        fkey = str(t) + "-" + NEtype
                        #find the NEs using the _LFD_ function as before
                        for indices in NER.LFD(tokens, exavedeflike,
                                               inavedeflike, [1.1, t]):

                            # print t, [tokens[ix] for ix in indices]
                            # innums = [inavedeflike[ix][1] for ix in indices if ix != indices[0]]
                            # innums.append(inavedeflike[indices[0]][0])
                            # print "internal", NER.harmonic_mean(innums)
                            # raw_input()

                            # 'B-' for the first token, 'I-' for the rest
                            n = 0
                            for index in indices:
                                if n == 0:
                                    assignments[index][1] = 'B-' + NEtype
                                else:
                                    assignments[index][1] = 'I-' + NEtype
                                n += 1  #keep track of position in NE

                        ## write out according to file handles, here
                        # the file is re-opened in append mode for every
                        # record and closed again right after writing
                        fs[fkey][0] = open(fs[fkey][1], "a")
                        for i, assignment in enumerate(assignments):
                            fs[fkey][0].writelines("\t".join(
                                [assignment[0], tags[i], assignment[1]]) +
                                                   "\n")
                        fs[fkey][0].writelines("\n")
                        fs[fkey][0].close()

        ## evaluate all thresholds and all NE types for the best of the round
        for fkey in fs:
            ## fs[fkey][0].close()
            NEtype = "-".join(re.split("-", fkey)[1:])
            t = float(re.split("-", fkey)[0])
            filename = fs[fkey][1]
            # Run the evaluation script on the prediction file, take the
            # first output line that mentions this entity type, strip
            # everything but digits, dots and semicolons and parse the
            # ';'-separated numbers as floats.
            # NOTE(review): the bare except below turns ANY failure
            # (script error, unexpected output format) into a zero score.
            try:
                results[NEtype].append((map(
                    float,
                    re.split("\;", [
                        re.sub("[^0-9\.\;]+", "", re.sub("\d+$|\d\:", "", r))
                        for r in re.split(
                            "\n",
                            subprocess.check_output(
                                "python2 ../data/wnuteval.py " + filename,
                                shell=True)) if re.search(NEtype, r)
                    ][0])), t))
            except:
                results[NEtype].append(([0., 0., 0.], t))
            allresults[NEtype].append(tuple(results[NEtype][-1]))

        ## store the best of this round as tstarts
        # the third parsed number is the selection criterion
        # (presumably F1 -- confirm against wnuteval.py's output format)
        for NEtype in results:
            tstarts[NEtype] = max(results[NEtype], key=lambda x: x[0][2])[1]

        print "here are the end-of-round thresholds: "
        for NEtype in tstarts:
            print NEtype, tstarts[NEtype]
        print
    with open("../data/predictions/" + outkey + "/allresults.json", "w") as f:
        f.writelines(json.dumps([tstarts, allresults]))
    return tstarts, allresults

コード例 #37

0

ファイルを表示

ファイル: tools.py プロジェクト: aboSamoor/lydia

    try:
        json.dump(struct, fh,indent=0)
        fh.close()
        shutil.copy(tmpFile.name, fName)
    except:
        print >> sys.stderr, "Failed to write the structure in", fName
        print_exception()
    tmpFile.close()

def loadJson(fName):
    """Load and return the JSON document stored in the file *fName*.

    Returns the decoded object.  If the file cannot be opened or parsed, a
    message is written to stderr, ``print_exception()`` is called and ``-1``
    is returned instead.
    """
    try:
        # FIX: the handler used to call fh.close() even when open() itself
        # had failed, so a missing file raised an unbound-name error instead
        # of returning -1.  The context manager closes the file in all cases.
        with open(fName, 'rb') as fh:
            return json.load(fh)
    except Exception:
        sys.stderr.write(
            "%s Failed to be interpreted as a Json file because of\n" %
            (fName, ))
        print_exception()
        return -1


if __name__ == "__main__":
    # Conversion pipeline over the file named on the command line; every
    # step reads the file produced by the previous one.
    fName = os.path.abspath(sys.argv[1])
    utf8_path = fName + ".utf8"
    bw_path = fName + ".BW"
    bw_utf8_path = bw_path + ".utf8"

    b2utf8(fName, utf8_path)
    NER.utf8tobw(utf8_path, bw_path)
    b2utf8(bw_path, bw_utf8_path)
    fixCols(bw_utf8_path)

コード例 #38

0

ファイルを表示

ファイル: Code.py プロジェクト: Jrhenderson11/NLP

def process_email(text):
	"""Process one email given as raw text.

	Wraps ``text`` in a ProcessedEmail, passes its ``body`` to
	Tagger.output_tagged_para, prints a separator line, and finally hands the
	original ``text`` to NER.print_names_text.
	"""
	parsed = ProcessedEmail(text)
	Tagger.output_tagged_para(parsed.body)
	separator = "==========================================="
	print (separator)
	NER.print_names_text(text)

コード例 #39

0

ファイルを表示

ファイル: what.py プロジェクト: nrvnujd/Natural-Language-Processing-Fall-2015

def _enclosed_text(sentence, opener, closer):
    """Return the text between the first ``opener`` and the next ``closer``.

    Returns None when ``opener`` is absent or no ``closer`` follows it.
    """
    start = sentence.find(opener)
    if start == -1:
        return None
    start += len(opener)
    end = sentence.find(closer, start)
    if end == -1:
        return None
    return sentence[start:end]


def answering_what(cleansedQuestion,stop_words_free_question,complete_sentence_list,sentence_list,sent_time_list,sent_person_list):
    """Choose an answer string for a "what" question from candidate sentences.

    Every sentence is scored with a set of hand-written rules; the first
    sentence with the highest score is selected and returned with every token
    that also occurs in the question removed.

    Parameters
    ----------
    cleansedQuestion : str
        The question text. It is passed unchanged to WM.stemWordMatch; a copy
        with double quotes and '?' removed (and abbreviations expanded) is
        used by all other rules.
    stop_words_free_question
        Not used by this function.
    complete_sentence_list : list of str
        Candidate sentences; these are scored and the answer is cut from them.
    sentence_list : list
        Indexed in parallel with complete_sentence_list; element i is what is
        passed to WM.stemWordMatch for sentence i.
    sent_time_list : list
        Indexed in parallel with complete_sentence_list; element i is compared
        with [] by the month rule (presumably the time expressions found in
        sentence i -- confirm against the caller).
    sent_person_list
        Not used by this function.

    Returns
    -------
    str
        One of:
        * the text enclosed in (...), --...-- or {...} of the first sentence
          that shares a token with a short question (at most 6 tokens);
        * a "<token> per cent" phrase when the single best sentence has one;
        * the best sentence without the question's tokens;
        * '' when complete_sentence_list is empty.
    """
    stanford_stop_words_list=['a','an','and','are','as','at','be','buy','do','for','from',
                          'has','have','he','in','is','it','its','of','on','that','the',
                          'to','was','were','will','with']

    what_month=('january','jan','february','feb','march','mar','april','apr','may','june','jun',
                'july','jul','august','aug','september','sep','october','oct','november','nov',
                'december','dec')

    abbreviation_list=[('Mt.','Mount')]

    verb_tags=['VB','VBD','VBZ','VBN']

    temp_q=cleansedQuestion.replace('"','').replace('?','')

    # Expand abbreviations. A token must equal the abbreviation exactly; a
    # substring test would let tokens such as "M" or "." trigger a replacement
    # across the whole question.
    for token in temp_q.split():
        for abbreviation, expansion in abbreviation_list:
            if token == abbreviation:
                temp_q=temp_q.replace(token,expansion)

    # NOTE(review): the entity lists returned here are not consulted by any
    # active rule. The call is kept in case it has side effects -- confirm and
    # drop it if it has none.
    NER.named_entity_recognition(temp_q)

    lmtzr=WordNetLemmatizer()

    # Lemmatized verbs of the question, ignoring stop words.
    q_verblist=[]
    for tagged in POS_Tagging.pos_tagging(temp_q):
        if tagged[1] in verb_tags:
            lemma=lmtzr.lemmatize(tagged[0],'v')
            if lemma not in stanford_stop_words_list:
                q_verblist.append(lemma)

    question_tokens=temp_q.split()
    sent_score_list=[]

    for i, sentence in enumerate(complete_sentence_list):
        sentence_tokens=sentence.split()

        # 1. Word Match scoring function for each of the sentences
        score=WM.stemWordMatch(cleansedQuestion,sentence_list[i])

        for q_token in question_tokens:
            lowered=q_token.lower()

            # 2. Question names a month and the sentence has a time entry
            if lowered in what_month:
                if sent_time_list[i] != []:
                    score=score+4

            # 3. What "kind" questions: reward every sentence token that
            #    lemmatizes to "call" or "from"
            elif lowered=='kind':
                for word in sentence_tokens:
                    if lmtzr.lemmatize(word,'v') in ['call','from']:
                        score=score+6

            # 4. Question contains "name": reward every sentence token that
            #    lemmatizes to "name", "call" or "known"
            elif lowered=='name':
                for word in sentence_tokens:
                    if lmtzr.lemmatize(word,'v') in ['name','call','known']:
                        score=score+20

        # Rule 5 (question has "name of/for" and the sentence has a person
        # entity) was left disabled by the original author.

        # 6. Reward sentences containing a verb that appears in the question
        for tagged in POS_Tagging.pos_tagging(sentence):
            if tagged[1] in verb_tags and lmtzr.lemmatize(tagged[0],'v') in q_verblist:
                score=score+6

        # 7. Definition-type questions ("what is X"): for a short question,
        #    return the delimited aside of the first sentence sharing a token
        #    with the question.
        if len(question_tokens) <= 6:
            for opener, closer in (('(',')'),('--','--'),('{','}')):
                if opener not in sentence:
                    continue
                if any(tok in sentence_tokens for tok in question_tokens):
                    definition=_enclosed_text(sentence,opener,closer)
                    # A missing closing delimiter no longer raises; the
                    # sentence is simply scored like any other.
                    if definition is not None:
                        return definition
                # Only the first delimiter kind present is ever considered.
                break

        # Two further rules ("sport" terms, "country" + location) were left
        # disabled by the original author.

        sent_score_list.append(score)

    # Nothing to choose from.
    if not sent_score_list:
        return ''

    # Keep every sentence that reaches the maximum score.
    max_score_value=max(sent_score_list)
    final_sent_list=[complete_sentence_list[i]
                     for i, value in enumerate(sent_score_list)
                     if value==max_score_value]

    # The first top-scoring sentence is the one the answer is cut from.
    best_tokens=final_sent_list[0].split()

    if len(final_sent_list) == 1:
        # "<token> per cent": only interior positions, so that both the
        # preceding token and the following "cent" exist.
        for k in range(1,len(best_tokens)-1):
            if best_tokens[k].lower()=='per' and best_tokens[k+1].lower()=='cent':
                return ' '.join(best_tokens[k-1:k+2])

    answer_list=[tok for tok in best_tokens if tok not in question_tokens]
    return ' '.join(answer_list)

コード例 #40

0

ファイルを表示

# -*- coding: utf-8 -*-
import OCR
import NER
from flask import Flask, request
import os
import json

# os.environ["CUDA_VISIBLE_DEVICES"] = "1"
OCR_test = OCR.testOCR()  # Load the OCR module
NER_test = NER.testNER()  # Load the NER module

# Flask usage reference: https://www.cnblogs.com/lsdb/p/10488448.html
app = Flask(__name__)
# Absolute path of the directory that contains this file
basedir = os.path.abspath(os.path.dirname(__file__))


# route() registers the URL rule for the view below (similar to route
# configuration in Spring).
# Equivalent to writing, after the function:
#   app.add_url_rule('/', 'helloworld', hello_world)
@app.route('/')
def hello_world():
    """Return a fixed greeting for requests to the root URL."""
    greeting = 'Hello, World!'
    return greeting


# Reference for reading the raw flask.request body: http://codingdict.com/sources/py/flask.request/4366.html


@app.route('/cht/up_photo', methods=['post'])
def up_photo_cht():
    # Handles POST /cht/up_photo: the raw request body is parsed as JSON and
    # its "photo" entry is read (base64-encoded data, judging by the name).
    # NOTE(review): nothing is returned in the visible code -- the listing
    # appears to be cut off here.
    a = request.get_data()
    dict1 = json.loads(a)
    base64_data = dict1["photo"]

Python NER, CompQAの例