def main():
    parser = argparse.ArgumentParser(description="""\
Creates tag statistics.
""")
    parser.add_argument("-I", "--input", required=True, help="input file")
    parser.add_argument("-O", "--output", required=True, help="output file")
    parser.add_argument("-L", "--lexicon", required=True, help="lexicon file")
    parser.add_argument("-M", "--max", help="maximum output")
    args = parser.parse_args()

    if args.input and args.output and args.lexicon:
        lexicon = json.load(open(args.lexicon))
        with codecs.open(args.output, "w", "utf-8") as out:
            wt = defaultdict(set)
            wc = Counter()
            wtc = Counter()
            for sentence in codecs.open(args.input, "r", "utf-8"):
                tokens = [str2tuple(token) for token in sentence.split()]
                for word, tag in tokens:
                    wt[word].add(tag)
                    wc[word] += 1
                    wtc[tuple2str((word, tag))] += 1

            r = {"Count": [], "Words": [], "Found": [], "Lexicon": []}
            if args.max:
                max_num = int(args.max)
            else:
                max_num = None

            for word, count in wc.most_common(max_num):
                r["Words"].append(word)
                r["Count"].append(count)
                tg = set()
                for tag in wt[word]:
                    t = tuple2str((word, tag))
                    in_lex = ""
                    if lexicon.get(word.lower()):
                        if tag not in lexicon.get(word.lower()):
                            in_lex = "*"
                    tg.add((tag + in_lex, wtc[t]))
                tg = sorted(tg, key=lambda k: k[1], reverse=True)
                r["Found"].append(", ".join([u"{0} ({1})".format(x, y) for x, y in tg]))
                if lexicon.get(word.lower()):
                    r["Lexicon"].append(", ".join(lexicon.get(word.lower())))
                else:
                    r["Lexicon"].append("")

            out.write(u"{0}".format(tabulate(r, headers="keys", tablefmt="pipe")))
    else:
        parser.print_help()
def deeperNLPFeatures(dir_file="./training/"):
    files = listdir(dir_file)
    stemmer = PorterStemmer()
    file_id = 0
    for f in files:
        file_id += 1
        with open(dir_file + f, 'r', encoding='ISO-8859-1') as text_file:
            text = text_file.read()
            text = text.strip().lower()
            tokens = word_tokenize(text)
            # head_word = get_dependency_relations(text)
            tagged_tok = pos_tag(tokens)
            tagged_list = [tuple2str(t) for t in tagged_tok]
            tokens_clean = deleteStopWords(tokens)
            lemma_line = get_lemmatized_line(tagged_tok)
            stem_line = [stemmer.stem(t) for t in tokens_clean]
            synonyms, hypernyms, hyponyms, meronyms, holonymns = get_semantic_features(tagged_tok, tokens)
            print(dir_file + f, ' ===> \n',
                  'text: \n', text, '\n',
                  'tokens: \n', tokens, '\n',
                  'pos tag: \n', tagged_tok, '\n',
                  'remove_stopWords: \n', tokens_clean, '\n',
                  'lemmatized: \n', lemma_line, '\n',
                  'stemmed: \n', stem_line, '\n',
                  'synonyms: \n', synonyms, '\n',
                  'hypernyms: \n', hypernyms, '\n',
                  'hyponyms: \n', hyponyms, '\n',
                  'meronyms: \n', meronyms, '\n',
                  'holonymns: \n', holonymns, '\n\n')
def fix(corpus_ud, corpus_itb, corpus_out):
    itb = map(
        lambda tagged: map(str2tuple, tagged.split()),
        io.open(corpus_itb, 'r', encoding='utf-8').read().strip().split('\n'))
    ud = map(
        lambda tagged: map(str2tuple, tagged.split()),
        io.open(corpus_ud, 'r', encoding='utf-8').read().strip().split('\n'))
    corpus = []
    for x, y in zip(ud, itb):
        sent = []
        for xx, yy in zip(x, y):
            xx, yy = list(xx), list(yy)
            if xx[1] == 'PROPN':
                yy[1] = 'E--'
            if 'X--' in yy[1] or 'F--' in yy[1]:
                if xx[1] == 'NOUN':
                    yy[1] = yy[1].replace('X--', 'NSD').replace('F--', 'NSD')
                elif xx[1] == 'VERB':
                    yy[1] = yy[1].replace('X--', 'VSA').replace('F--', 'NSD')
                elif xx[1] == 'ADJ':
                    yy[1] = yy[1].replace('X--', 'ASP').replace('F--', 'ASP')
                elif xx[1] == 'ADV':
                    yy[1] = yy[1].replace('X--', 'D--').replace('F--', 'D--')
                elif xx[1] == 'ADP':
                    yy[1] = yy[1].replace('X--', 'R--').replace('F--', 'R--')
                elif xx[1] == 'DET':
                    yy[1] = yy[1].replace('X--', 'B--').replace('F--', 'B--')
            sent.append(tuple2str(yy))
        corpus.append(sent)
    with io.open(corpus_out, 'w', encoding='utf-8') as out:
        for sent in corpus:
            out.write(' '.join(sent))
            out.write('\n')
def pos_tag_raw_text(self, text, as_tuple_list=True):
    # Unfortunately, for the moment there is no method to do sentence splitting + POS tagging in
    # nltk.parse.corenlp. Only raw_tag_sents is available, but it assumes a list of str (i.e. the
    # sentences are already split). We create a small custom function, highly inspired by
    # raw_tag_sents, to do both.

    def raw_tag_text():
        """
        Perform tokenizing, sentence splitting and POS tagging, and keep the sentence-split structure.
        """
        properties = {'annotators': 'tokenize,ssplit,pos'}
        tagged_data = self.parser.api_call(text, properties=properties)
        for tagged_sentence in tagged_data['sentences']:
            yield [(token['word'], token['pos']) for token in tagged_sentence['tokens']]

    tagged_text = list(raw_tag_text())

    if as_tuple_list:
        return tagged_text
    return '[ENDSENT]'.join(
        [' '.join([tuple2str(tagged_token, self.separator) for tagged_token in sent])
         for sent in tagged_text])
def fix(corpus_ud, corpus_itb, corpus_out):
    itb = map(
        lambda tagged: map(str2tuple, tagged.split()),
        codecs.open(corpus_itb, 'r', 'utf-8').read().strip().split('\n'))
    ud = map(
        lambda tagged: map(str2tuple, tagged.split()),
        codecs.open(corpus_ud, 'r', 'utf-8').read().strip().split('\n'))
    corpus = []
    for x, y in zip(ud, itb):
        sent = []
        for xx, yy in zip(x, y):
            xx, yy = list(xx), list(yy)
            if xx[1] == 'PROPN':
                yy[1] = 'E--'
            if 'X--' in yy[1] or 'F--' in yy[1]:
                if xx[1] == 'NOUN':
                    yy[1] = yy[1].replace('X--', 'NSD').replace('F--', 'NSD')
                elif xx[1] == 'VERB':
                    yy[1] = yy[1].replace('X--', 'VSA').replace('F--', 'NSD')
                elif xx[1] == 'ADJ':
                    yy[1] = yy[1].replace('X--', 'ASP').replace('F--', 'ASP')
                elif xx[1] == 'ADV':
                    yy[1] = yy[1].replace('X--', 'D--').replace('F--', 'D--')
                elif xx[1] == 'ADP':
                    yy[1] = yy[1].replace('X--', 'R--').replace('F--', 'R--')
                elif xx[1] == 'DET':
                    yy[1] = yy[1].replace('X--', 'B--').replace('F--', 'B--')
            sent.append(tuple2str(yy))
        corpus.append(sent)
    with codecs.open(corpus_out, 'w', 'utf-8') as out:
        for sent in corpus:
            out.write(' '.join(sent))
            out.write('\n')
def tokenize(file_path):
    '''
    Helper function to preprocess and tokenize articles.

    1) Lowercase
    2) Tokenize using Punkt tokenizer
    3) Part-of-speech tag using Averaged Perceptron tagger
    4) Lemmatize using WordNet lemmatizer
    5) Filter out stopwords

    Parameters
    ----------
    file_path : string
        Path to article to be tokenized.
    '''
    tokens = []
    stopwords = set(nltk.corpus.stopwords.words('english'))
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

    with open(file_path, 'r') as f:
        data = f.read().lower()
        for sent in nltk.sent_tokenize(data):
            for word in nltk.word_tokenize(sent):
                tokens += [word]

    tagged_tokens = nltk.pos_tag(tokens)

    if len(tagged_tokens) <= 100:
        # Assume that this document is from corpus 2
        # Strip proper nouns and cardinal numbers
        tokens = [
            tuple2str((lemmatizer.lemmatize(token, penn_to_wordnet(tag)), tag))
            for token, tag in tagged_tokens
            if (token not in stopwords and tag not in ['NNP', 'NNPS', 'CD'])
        ]
    else:
        tokens = [
            tuple2str((lemmatizer.lemmatize(token, penn_to_wordnet(tag)), tag))
            for token, tag in tagged_tokens
            if token not in stopwords
        ]
    return tokens
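# penn_to_wordnet is referenced above but not shown in the snippet. Below is a minimal sketch of
# what such a helper commonly looks like (an assumption, not the original implementation): it maps
# Penn Treebank tags onto the WordNet POS constants accepted by WordNetLemmatizer.lemmatize.
from nltk.corpus import wordnet


def penn_to_wordnet(penn_tag):
    # Adjectives, verbs and adverbs get their own WordNet POS; everything else
    # falls back to noun, which is also the lemmatizer's default behaviour.
    if penn_tag.startswith('J'):
        return wordnet.ADJ
    if penn_tag.startswith('V'):
        return wordnet.VERB
    if penn_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN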
def pos_tag_raw_text(self, text, as_tuple_list=True):
    """
    Implementation of abstract method from PosTagging
    @see PosTagging
    """
    tagged_text = self.tagger.tag_sents([self.sent_tokenizer.sentences_from_text(text)])

    if as_tuple_list:
        return tagged_text
    return '[ENDSENT]'.join(
        [' '.join([tuple2str(tagged_token, self.separator) for tagged_token in sent])
         for sent in tagged_text])
def chunked_sent_string(self, sent):
    parts = []
    for word, tag in sent:
        try:
            brack = word in u'[]'
        except:
            brack = False
        if brack:
            # brackets don't get a tag
            parts.append(word)
        else:
            # make sure no brackets or slashes in tag
            tag = tag.replace(u'[', u'(').replace(u']', u')').replace(u'/', '|')
            parts.append(tuple2str((word, tag)))
    return ' '.join(parts)
def createIndex(instance_url='http://localhost:8983/solr/collection_1/', dir_file="./training/"):
    # def indexer(dir_file="./training/"):
    '''
    Load the Solr instance and index the documents found in dir_file.
    '''
    solr = pysolr.Solr(instance_url)
    files = listdir(dir_file)
    data = []
    stemmer = PorterStemmer()
    file_id = 0
    for f in files:
        file_id += 1
        with open(dir_file + f, 'r', encoding='ISO-8859-1') as text_file:
            text = text_file.read()
            text = text.strip().lower()
            tokens = word_tokenize(text)
            # head_word = get_dependency_relations(text)
            tagged_tok = pos_tag(tokens)
            tagged_list = [tuple2str(t) for t in tagged_tok]
            tokens_clean = deleteStopWords(tokens)
            lemma_line = get_lemmatized_line(tagged_tok)
            stem_line = [stemmer.stem(t) for t in tokens_clean]
            synonyms, hypernyms, hyponyms, meronyms, holonymns = get_semantic_features(tagged_tok, tokens)
            data.append({
                'id': f + '_' + str(file_id),
                'text': ' '.join(tokens),
                'pos_tag': ' '.join(tagged_list),
                'text_clean': ' '.join(tokens_clean),
                'lemmas': lemma_line,
                'stems': ' '.join(stem_line),
                'synonyms': synonyms,
                'hypernyms': hypernyms,
                'hyponyms': hyponyms,
                'meronyms': meronyms,
                'holonymns': holonymns,
                # 'head_word': head_word,
            })
    # print(data)
    if data:
        solr.add(data)
def pos_tag_raw_text(self, text, as_tuple_list=True):
    # Unfortunately, for the moment there is no method to do sentence splitting + POS tagging in
    # nltk.parse.corenlp. Only raw_tag_sents is available, but it assumes a list of str (i.e. the
    # sentences are already split). We create a small custom function, highly inspired by
    # raw_tag_sents, to do both.
    parsed_text = self.parser(text)
    sentences = parsed_text.sentences

    def raw_tag_text():
        """
        Perform tokenizing, sentence splitting and POS tagging, and keep the sentence-split structure.
        """
        for tagged_sentence in sentences:
            yield [(token.text, token.xpos) for token in tagged_sentence.words]

    tagged_text = list(raw_tag_text())

    if as_tuple_list:
        return tagged_text, sentences
    return ('[ENDSENT]'.join(
        [' '.join([tuple2str(tagged_token, self.separator) for tagged_token in sent])
         for sent in tagged_text]), sentences)
def match(query, method, instance_url):
    '''
    Search the Solr index using the user query.
    '''
    q_list = []
    q_string = ''
    solr = pysolr.Solr(instance_url)
    tokens = word_tokenize(query)
    stemmer = PorterStemmer()
    tagged_tokens = pos_tag(tokens)
    tagged_list = [tuple2str(t) for t in tagged_tokens]
    tokens_clean = deleteStopWords(tokens)
    lemmas = get_lemmatized_line(tagged_tokens)
    stem_line = [stemmer.stem(t) for t in tokens_clean]
    synonyms, hypernyms, hyponyms, meronyms, holonyms = get_semantic_features(tagged_tokens, tokens)
    # head_words = get_dependency_relations(lemmas, True)

    print('user input features', ' ===> \n',
          'text: \n', query, '\n',
          'tokens: \n', tokens, '\n',
          'pos tag: \n', tagged_tokens, '\n',
          'remove_stopWords: \n', tokens_clean, '\n',
          'lemmatized: \n', lemmas, '\n',
          'stemmed: \n', stem_line, '\n',
          'synonyms: \n', synonyms, '\n',
          'hypernyms: \n', hypernyms, '\n',
          'hyponyms: \n', hyponyms, '\n',
          'meronyms: \n', meronyms, '\n',
          'holonymns: \n', holonyms, '\n\n')

    pos_tag_data = '&'.join(tagged_list)
    lemmas = '&'.join(lemmas.split())
    stems = '&'.join(stem_line)
    synonyms = '&'.join(synonyms.split())
    hypernyms = '&'.join(hypernyms.split())
    hyponyms = '&'.join(hyponyms.split())
    holonyms = '&'.join(holonyms.split())
    meronyms = '&'.join(meronyms.split())

    if method == 3:
        # head_words = '&'.join(head_words.split())
        if tokens:
            q_list.append('text:' + '&'.join(tokens))
        if tokens_clean:
            q_list.append('text_clean:' + '&'.join(tokens_clean))
        if pos_tag_data:
            q_list.append('pos_tag:' + pos_tag_data)
        if lemmas:
            q_list.append('lemmas:' + lemmas)
        if stems:
            q_list.append('stems:' + stems)
        if synonyms:
            q_list.append('synonyms:' + synonyms)
        if hypernyms:
            q_list.append('hypernyms:' + hypernyms)
        if hyponyms:
            q_list.append('hyponyms:' + hyponyms)
        if meronyms:
            q_list.append('meronyms:' + meronyms)
        if holonyms:
            q_list.append('holonymns:' + holonyms)
        # if head_words:
        #     q_list.append('head_word:' + head_words)

    if method == 4:
        if tokens:
            q_list.append('text:' + '&'.join(tokens) + '^0.5')
        if pos_tag_data:
            q_list.append('pos_tag:' + pos_tag_data + '^0.02')
        if lemmas:
            q_list.append('lemmas:' + lemmas + '^4')
        if tokens_clean:
            q_list.append('text_clean:' + '&'.join(tokens_clean) + '^5')
        if stems:
            q_list.append('stems:' + stems + '^1.5')
        if synonyms:
            q_list.append('synonyms:' + synonyms + '^5')
        if hypernyms:
            q_list.append('hypernyms:' + hypernyms + '^5')
        # if hyponyms:
        #     q_list.append('hyponyms:' + hyponyms + '^4')
        # if head_words:
        #     q_list.append('head_word:' + head_words + '^0.5')
        # if meronyms:
        #     q_list.append('meronyms:' + meronyms + '^1.4')
        # if holonyms:
        #     q_list.append('holonymns:' + holonyms + '^1.4')

    q_string = ', '.join(q_list)
    print('The Solr query is q=%s, fl=\'id,text\'\n' % (q_string))
    result = solr.search(q=q_string, fl='id,text')
    for r in result:
        print(r['id'])
        print(' '.join(r['text']))
import nltk
from nltk.tag.util import tuple2str

taggedtok = ('bear', 'NN')
print(tuple2str(taggedtok))
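# A small complementary sketch (not part of the snippet above): tuple2str joins a (word, tag)
# pair with a separator (default '/'), and nltk.tag.util.str2tuple is its inverse.
from nltk.tag.util import str2tuple, tuple2str

assert tuple2str(('bear', 'NN')) == 'bear/NN'            # default separator
assert tuple2str(('bear', 'NN'), sep='_') == 'bear_NN'   # custom separator
assert str2tuple('bear/NN') == ('bear', 'NN')            # round-trip back to a tuple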
def test(test_file, output_file, transition_probabilities, emission_probabilities):
    """
    Performs part-of-speech tagging on each word in a sentence in the test file using the Viterbi algorithm.

    Probabilities are calculated on a logarithmic scale to avoid problems associated with multiplying small
    floating point numbers. For cases where the emission probability of a token given a tag was not encountered
    in the training data, the emission probability is treated as 0. As probabilities are being calculated on a
    logarithmic scale, and log(0) is undefined, log(float_info.min) is used as a substitute, representing the
    logarithmic value as it approaches 0.

    Side effects of substituting "0" probabilities with log(float_info.min):

    - in cases where there are no unknown tokens (defined as tokens not encountered in the tagged training
      corpus), log(float_info.min) serves as a heavy penalty on a path's probability wherever a "0" probability
      occurs, relative to other paths without one. When considering the best paths for a token's tag types, a
      path with log(float_info.min) in its probability will fare poorer in comparisons with paths without such
      "0" probabilities, as log(float_info.min) is several magnitudes smaller than the smallest real
      probabilities.

    - in cases where there are unknown tokens, every emission probability for that token given a tag type will
      be "0" (i.e. log(float_info.min)). As path probabilities are calculated on a logarithmic scale,
      probabilities are added rather than multiplied, so a single unknown token in a sequence does not render
      all paths in the trellis 0. Instead, log(float_info.min) is a flat penalty across all paths, effectively
      cancelling out in comparisons. In the case where unknown tokens are encountered, the emission
      probabilities of that token are therefore not decisive; only the tag types of the previous token and the
      transition probabilities between tag types determine the best paths.

    Viterbi nodes are initialized to -float_info.max such that, in comparisons to determine the best paths,
    they are replaced by even the worst path probabilities, as -float_info.max is a googol googol googol
    (that's 300 zeroes) magnitudes smaller than log(float_info.min). Even if a sentence contains several
    unknown tokens, the resulting path probabilities are still larger than -float_info.max.

    :param test_file: the file containing sentences to perform part-of-speech tagging on
    :param output_file: the file to write the part-of-speech tagged sentences to
    :param transition_probabilities: transition probabilities of a trained hidden Markov model, on a logarithmic scale
    :param emission_probabilities: emission probabilities of a trained hidden Markov model, on a logarithmic scale
    """
    with open(test_file, "r") as test_data, open(output_file, "w") as output:
        for line in tqdm(test_data, total=rawcount(test_file), desc="Testing "):
            tokens = tuple(line.split())
            tag_types = list(load('help/tagsets/upenn_tagset.pickle').keys()) + ["-LRB-", "-RRB-", "#"]
            # The tagset in nltk uses different notations
            tag_types = [x for x in tag_types if x not in ["(", ")", "--"]]

            # Initialize the arrays that model the Viterbi trellis for the given test input.
            # The viterbi array keeps track of the best probability path to a token's tag type from the
            # previous token. For each best path, the backpointer array keeps track of the tag type in the
            # previous token.
            viterbi = np.full((len(tokens), len(tag_types)), -float_info.max)
            backpointer = np.full((len(tokens), len(tag_types)), -1, dtype=int)

            # Initialize paths in the trellis from start to the tag types (states) corresponding to the
            # first token (observation)
            for t_index, tag in enumerate(tag_types):
                viterbi[0][t_index] = transition_probabilities[("START", tag)] \
                    + emission_probabilities.get((tokens[0], tag), log(float_info.min))

            # Iteratively fill out Viterbi path probabilities between the tag types of each token and the
            # tag types of the token immediately preceding it in the sequence
            if len(tokens) > 1:
                for token_index, (prev_token, curr_token) in enumerate(pairwise(tokens)):
                    for ctag_index, curr_tag in enumerate(tag_types):
                        for ptag_index, prev_tag in enumerate(tag_types):
                            temp_viterbi = viterbi[token_index][ptag_index] \
                                + transition_probabilities[(prev_tag, curr_tag)] \
                                + emission_probabilities.get((curr_token, curr_tag), log(float_info.min))
                            if temp_viterbi >= viterbi[token_index + 1][ctag_index]:
                                viterbi[token_index + 1][ctag_index] = temp_viterbi
                                backpointer[token_index + 1][ctag_index] = ptag_index

            # Determine the best terminating path from the last token
            last_token_index = len(tokens) - 1
            end_viterbi = -float_info.max
            end_backpointer = -1
            for tag_index, prev_tag in enumerate(tag_types):
                temp_viterbi = viterbi[last_token_index][tag_index] \
                    + transition_probabilities[(prev_tag, "END")]
                if temp_viterbi >= end_viterbi:
                    end_viterbi = temp_viterbi
                    end_backpointer = tag_index

            # Perform the Viterbi backtrace, finding the most likely tag type sequence through the best
            # paths back to the beginning
            likeliest_tag_indexes = [-1] * len(tokens)
            likeliest_tag_indexes[-1] = end_backpointer
            for token_index in reversed(range(len(tokens) - 1)):
                likeliest_tag_indexes[token_index] = backpointer[token_index + 1][likeliest_tag_indexes[token_index + 1]]

            # Formatting output
            likeliest_tags = [tag_types[index] for index in likeliest_tag_indexes]
            pos_tagged_line = ' '.join([tuple2str(tagged_token) for tagged_token in zip(tokens, likeliest_tags)])
            output.write(pos_tagged_line + "\n")
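# A quick standalone check (not from the original project) of the magnitudes the docstring above
# relies on: log(float_info.min) is on the order of -708, while -float_info.max is about -1.8e308,
# so an initialized trellis cell always loses the comparison against any real path probability.
from math import log
from sys import float_info

print(log(float_info.min))   # ~ -708.4, the "0 probability" substitute
print(-float_info.max)       # ~ -1.7976931348623157e+308, the trellis initialization value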
def searchintask4_default(query):
    solrInstance = 'http://localhost:8983/solr/task3/'
    solr = pysolr.Solr(solrInstance)
    stemmer = PorterStemmer()
    tokensTaggedTest = pos_tag(query)
    head_words = dependencyRel(' '.join(query))
    synonymsTest, hypernymsTest, hyponymsTest, meronymsTest, holonymsTest = getFeatures(tokensTaggedTest, query)
    lemmasTest = getLemmas(tokensTaggedTest)
    stem11 = [stemmer.stem(t) for t in query]
    stemTest = ' '.join(stem11)
    listTagged = [tuple2str(t) for t in tokensTaggedTest]
    posData = ' '.join(listTagged)

    posData = '&'.join(posData.split())
    lemmasTest = '&'.join(lemmasTest.split())
    stemTest = '&'.join(stemTest.split())
    synonymsTest = '&'.join(synonymsTest.split())
    hypernymsTest = '&'.join(hypernymsTest.split())
    hyponymsTest = '&'.join(hyponymsTest.split())
    holonymsTest = '&'.join(holonymsTest.split())
    meronymsTest = '&'.join(meronymsTest.split())
    head_words = '&'.join(head_words.split())

    q_list = []
    if query:
        q_list.append('faq:' + ' '.join(query) + '^1.8')
        q_list.append('faq_ans:' + ' '.join(query) + '^0.8')
    if posData:
        q_list.append('pos_tag_q:' + posData + '^0.02')
        q_list.append('pos_tag_a:' + posData + '^0.001')
    if lemmasTest:
        q_list.append('lemma_q:' + lemmasTest + '^2.0')
        q_list.append('lemma_a:' + lemmasTest + '^1.0')
    # if stemTest:
    #     q_list.append('stem:' + stemTest + '^1.5')
    if synonymsTest:
        q_list.append('synonyms_q:' + synonymsTest + '^3.0')
        q_list.append('synonyms_a:' + synonymsTest + '^1.5')
    if hypernymsTest:
        q_list.append('hypernyms_q:' + hypernymsTest + '^4.0')
        q_list.append('hypernyms_a:' + hypernymsTest + '^3.5')
    if head_words:
        q_list.append('head_words_q:' + head_words + '^3.0')
        q_list.append('head_words_a:' + head_words + '^2.0')
    if hyponymsTest:
        q_list.append('hyponyms_q:' + hyponymsTest + '^0.24')
        q_list.append('hyponyms_a:' + hyponymsTest + '^0.14')
    if meronymsTest:
        q_list.append('meronyms_q:' + meronymsTest + '^0.14')
        q_list.append('meronyms_a:' + meronymsTest + '^0.10')
    if holonymsTest:
        q_list.append('holonyms_q:' + holonymsTest + '^0.14')
        q_list.append('holonyms_a:' + holonymsTest + '^0.10')

    # print(','.join(q_list))
    q_string = ', '.join(q_list)
    print("Query is: ")
    print("q=" + q_string + ", fl='*, score', rows=" + str(10))
    input("Press Enter to continue...")
    result = solr.search(q=q_string, fl='*, score', rows=10)
    # for r in json.dumps(result.docs):
    #     print(r)
    # for r in result:
    #     print(r['id'], r['text'])
    #     # print(r['text'])

    print()
    print("------------------")
    print("| SEARCH RESULTS |")
    print("------------------")
    print()
    print("Saw {0} result(s).".format(len(result)))

    j = 0
    top_10 = []
    for result1 in result:
        j += 1
        print(j)
        # print(result1)
        temp = result1['id']
        art = str(temp).split("_")
        # print("Article : " + art[0])
        # print("Sentence : " + art[1])
        sen = result1['faq']
        print(sen)
        top_10.append(sen[0])
        print(result1['score'])
        print("-----------------------")

    print(str(top_10[0]))
    return top_10
def tag_words_string(tagged):
    return ' '.join([tuple2str(i) for i in tagged])
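# A minimal usage sketch (assuming NLTK's word_tokenize and pos_tag, which are not shown in the
# snippet above): tag_words_string flattens a tagged sentence into the familiar word/TAG format.
from nltk import pos_tag, word_tokenize

tagged = pos_tag(word_tokenize("The bear sleeps"))
print(tag_words_string(tagged))   # e.g. "The/DT bear/NN sleeps/VBZ"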
def indexingProcessTask3(csv_data):
    solrInstance = 'http://localhost:8983/solr/task3/'
    start = time.time()
    solr = pysolr.Solr(solrInstance)
    stemmer = PorterStemmer()
    data = []
    questionid = 1
    for row in csv_data:
        tokensque = word_tokenize(row[0].strip())
        stopRemovedQue = removestop(tokensque)
        tokensans = word_tokenize(row[1].strip())
        stopRemovedans = removestop(tokensans)
        postaggedque = pos_tag(tokensque)
        postaggedans = pos_tag(tokensans)
        list_pos_tagged_que = [tuple2str(t) for t in postaggedque]
        list_pos_tagged_ans = [tuple2str(t) for t in postaggedans]
        lemma1 = getLemmas(postaggedque)
        head_words_que = dependencyRel(row[0].strip())
        head_words_ans = dependencyRel(row[1].strip())
        extra_heads_que = set()
        for x in head_words_que.split():
            if x in custom_syn_map:
                extra_heads_que.add(custom_syn_map[x])
        for x in extra_heads_que:
            head_words_que += " " + x
        stem1 = [stemmer.stem(t) for t in tokensque]
        lemma2 = getLemmas(postaggedans)
        stem2 = [stemmer.stem(t) for t in tokensans]
        synonyms, hypernyms, hyponyms, meronyms, holonyms = getFeatures(postaggedque, tokensque)
        synonyms_a, hypernyms_a, hyponyms_a, meronyms_a, holonyms_a = getFeatures(postaggedans, tokensans)
        data.append({
            'id': str(questionid),
            'faq_original': str(row[0]),
            'faq': ' '.join(tokensque),
            'stop_words_q': ' '.join(stopRemovedQue),
            'pos_tag_q': ' '.join(list_pos_tagged_que),
            'lemma_q': lemma1,
            'stem_q': ' '.join(stem1),
            'head_words_q': head_words_que,
            'synonyms_q': synonyms,
            'hypernyms_q': hypernyms,
            'hyponyms_q': hyponyms,
            'meronyms_q': meronyms,
            'holonyms_q': holonyms,
            'faq_ans': ' '.join(tokensans),
            'stop_words_a': ' '.join(stopRemovedans),
            'pos_tag_a': ' '.join(list_pos_tagged_ans),
            'lemma_a': lemma2,
            'stem_a': ' '.join(stem2),
            'head_words_a': head_words_ans,
            'synonyms_a': synonyms_a,
            'hypernyms_a': hypernyms_a,
            'hyponyms_a': hyponyms_a,
            'meronyms_a': meronyms_a,
            'holonyms_a': holonyms_a,
        })
        questionid += 1
    solr.add(data)