def _asks_about(question, nouns):
    """Return True if *question* contains "what <noun>" or "which <noun>"
    for any of the given nouns (question is expected to be lower-cased)."""
    return any(("what " + noun) in question or ("which " + noun) in question
               for noun in nouns)


def _question_type(question):
    """Classify an (already lower-cased) question by expected answer type.

    Returns one of the tags consumed by better_qa():
      "TIM"    -- time-like answers (what/which time, year, century, ...)
      "LOC"    -- place-like answers (what/which place, city, ... or "where")
      "PER"    -- person answers (what/which person, or "who")
      "OTHER1" -- generic what/which questions
      "OTHER2" -- anything else
    """
    if _asks_about(question, ("time", "year", "century", "month", "decade")):
        return "TIM"
    if _asks_about(question, ("place", "area", "town", "state", "city", "country")):
        return "LOC"
    if _asks_about(question, ("person",)):
        return "PER"
    if "what" in question or "which" in question:
        # TODO (from original): tag as OTHER when what/how precedes "when";
        # "on what" -> LOC; "what" followed by a LOC/PER/TIM entity;
        # "what time" -> TIM.
        return "OTHER1"
    if "where" in question:
        return "LOC"
    if "when" in question:
        return "TIM"
    if "who" in question:  # covers whom/whose as well
        return "PER"
    return "OTHER2"


def _collect_tag_maps(context):
    """POS-tag and NER-chunk a paragraph, building four lookup tables.

    Returns (pos_dict, word_dict, entities, entity_dict) where
      pos_dict:    POS tag  -> unique words carrying that tag
      word_dict:   word     -> unique POS tags seen for that word
      entities:    NE label -> unique words chunked under that label
      entity_dict: word     -> unique NE labels seen for that word
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(context))
    chunked = nltk.chunk.ne_chunk(tagged)
    pos_dict, word_dict, entities, entity_dict = {}, {}, {}, {}
    # chunked.pos() yields ((word, pos_tag), entity_label) pairs.
    for (word, pos), label in chunked.pos():
        for table, key, value in ((pos_dict, pos, word),
                                  (word_dict, word, pos),
                                  (entities, label, word),
                                  (entity_dict, word, label)):
            bucket = table.setdefault(key, [])
            if value not in bucket:
                bucket.append(value)
    return pos_dict, word_dict, entities, entity_dict


# Any token containing one of these characters ends the current "sentence".
_SENTENCE_END_MARKS = ('.', ',', '?', '!', ';', ':')


def _split_sentences(context):
    """Split a paragraph into rough sentences on punctuation-bearing tokens.

    Returns (sentence, numofsen): *sentence* maps index -> list of normalized
    tokens; *numofsen* counts the complete sentences, so sentence[numofsen]
    is a (possibly empty) trailing fragment with no closing punctuation.
    """
    sentence = {0: []}
    numofsen = 0
    for token in context.split():
        sentence[numofsen].append(normalize(token))
        # If we reach any punctuation, mark the end of the sentence;
        # otherwise the "sentence" would grow very long.
        if any(mark in token for mark in _SENTENCE_END_MARKS):
            numofsen += 1
            sentence[numofsen] = []
    return sentence, numofsen


def main():
    """Answer-sentence retrieval over a SQuAD-style dev set.

    For every paragraph: build POS/NER lookup tables, split the context into
    rough sentences, then for each question pick the sentence whose averaged
    word vector is most cosine-similar to the question's vector, and hand it
    to better_qa() together with a coarse question-type tag.  All results are
    finally written to disk via store().
    """
    dev = load()
    result = {}
    classifier = V.classifier()
    for document in dev["data"]:
        for paragraph in document['paragraphs']:
            context = paragraph['context']
            # pos_dict and entity_dict are built alongside the others but
            # only entities/word_dict are consumed by better_qa() below.
            pos_dict, word_dict, entities, entity_dict = _collect_tag_maps(context)
            sentence, numofsen = _split_sentences(context)
            # NOTE(review): a trailing fragment after the last punctuation
            # mark (sentence[numofsen]) gets no vector and is only selected
            # when numofsen == 0 (maxnum stays 0) -- confirm this is intended.
            sentence_vectors = [classifier.create_avg_vector(sentence[i])
                                for i in range(numofsen)]
            for qa in paragraph['qas']:
                tag = _question_type(qa['question'].lower())
                qa_vector = classifier.create_avg_vector(normalize(qa['question']).split())
                maxsim = -1   # cosine similarity is never below -1
                maxnum = 0    # index of the most similar sentence so far
                for i in range(numofsen):
                    total = classifier.cosine_similarity(sentence_vectors[i], qa_vector)
                    if total >= maxsim:  # >=: keep the LAST best on ties
                        maxsim = total
                        maxnum = i
                better_qa(tag, sentence[maxnum], numofsen, qa['id'],
                          qa['question'], entities, word_dict, result)
    store(result)  ##write the answers to json file
with open('findsentence.json', 'w') as json_file: json_file.write(json.dumps(data)) ##transform json file to dictionary def load(): with open('training.json') as json_file: data = json.load(json_file) return data if __name__ == "__main__": data = {} data = load() dataset = data['data'] result = {} classifier = V.classifier() ##here we compute unigram overlap, because the order of the words in the questions are often different from contexts, unigram is more reliable for article in dataset: for paragraph in article['paragraphs']: paragraph_tokens=paragraph['context'].split() ##get tokens from context numofsen=0 ##number of sentences sentence={} sentence[numofsen] = ([]) for x in paragraph_tokens: sentence[numofsen].append(normalize(x)) if '.'in x or ',' in x or '?' in x or '!' in x or ';' in x or ':' in x: ##sign as the end of the sentence numofsen+=1 sentence[numofsen] = ([]) #print('start_classifier')