def main():
    parser = argparse.ArgumentParser(description="""\
    Creates tag statistics.
    """)
    parser.add_argument("-I", "--input", required=True, help="input file")
    parser.add_argument("-O", "--output", required=True, help="output file")
    parser.add_argument("-L", "--lexicon", required=True, help="lexicon file")
    parser.add_argument("-M", "--max", help="maximum output")
    args = parser.parse_args()

    if args.input and args.output and args.lexicon:
        with open(args.lexicon) as lexicon_file:
            lexicon = json.load(lexicon_file)
        with codecs.open(args.output, "w", "utf-8") as out:
            wt = defaultdict(set)
            wc = Counter()
            wtc = Counter()

            for sentence in codecs.open(args.input, "r", "utf-8"):
                tokens = [str2tuple(token) for token in sentence.split()]

                for word, tag in tokens:
                    wt[word].add(tag)
                    wc[word] += 1
                    wtc[tuple2str((word, tag))] += 1

            r = {"Count": [],
                 "Words": [],
                 "Found": [],
                 "Lexicon": []}

            if args.max:
                max_num = int(args.max)
            else:
                max_num = None

            for word, count in wc.most_common(max_num):
                r["Words"].append(word)
                r["Count"].append(count)
                tg = set()

                for tag in wt[word]:
                    t = tuple2str((word, tag))
                    in_lex = ""
                    if lexicon.get(word.lower()):
                        if tag not in lexicon.get(word.lower()):
                            in_lex = "*"
                    tg.add((tag + in_lex, wtc[t]))

                tg = sorted(tg, key=lambda k: k[1], reverse=True)
                r["Found"].append(", ".join([u"{0} ({1})".format(x, y)
                                                       for x, y in tg]))
                if lexicon.get(word.lower()):
                    r["Lexicon"].append(", ".join(lexicon.get(word.lower())))
                else:
                    r["Lexicon"].append("")
            out.write(u"{0}".format(tabulate(r,
                                             headers="keys",
                                             tablefmt="pipe")))
    else:
        parser.print_help()
Example #2
def main():
    parser = argparse.ArgumentParser(description="""\
    Creates tag statistics.
    """)
    parser.add_argument("-I", "--input", required=True, help="input file")
    parser.add_argument("-O", "--output", required=True, help="output file")
    parser.add_argument("-L", "--lexicon", required=True, help="lexicon file")
    parser.add_argument("-M", "--max", help="maximum output")
    args = parser.parse_args()

    if args.input and args.output and args.lexicon:
        with open(args.lexicon) as lexicon_file:
            lexicon = json.load(lexicon_file)
        with codecs.open(args.output, "w", "utf-8") as out:
            wt = defaultdict(set)
            wc = Counter()
            wtc = Counter()

            for sentence in codecs.open(args.input, "r", "utf-8"):
                tokens = [str2tuple(token) for token in sentence.split()]

                for word, tag in tokens:
                    wt[word].add(tag)
                    wc[word] += 1
                    wtc[tuple2str((word, tag))] += 1

            r = {"Count": [], "Words": [], "Found": [], "Lexicon": []}

            if args.max:
                max_num = int(args.max)
            else:
                max_num = None

            for word, count in wc.most_common(max_num):
                r["Words"].append(word)
                r["Count"].append(count)
                tg = set()

                for tag in wt[word]:
                    t = tuple2str((word, tag))
                    in_lex = ""
                    if lexicon.get(word.lower()):
                        if tag not in lexicon.get(word.lower()):
                            in_lex = "*"
                    tg.add((tag + in_lex, wtc[t]))

                tg = sorted(tg, key=lambda k: k[1], reverse=True)
                r["Found"].append(", ".join(
                    [u"{0} ({1})".format(x, y) for x, y in tg]))
                if lexicon.get(word.lower()):
                    r["Lexicon"].append(", ".join(lexicon.get(word.lower())))
                else:
                    r["Lexicon"].append("")
            out.write(u"{0}".format(
                tabulate(r, headers="keys", tablefmt="pipe")))
    else:
        parser.print_help()
Example #3
def deeperNLPFeatures(dir_file="./training/"):

    files = listdir(dir_file)
    stemmer = PorterStemmer()
    file_id = 0
    for f in files:
        file_id += 1
        with open(dir_file + f, 'r', encoding='ISO-8859-1') as text_file:
            text = text_file.read()
            text = text.strip().lower()
            tokens = word_tokenize(text)
            #            head_word = get_dependency_relations(text)
            tagged_tok = pos_tag(tokens)
            tagged_list = [tuple2str(t) for t in tagged_tok]
            tokens_clean = deleteStopWords(tokens)
            lemma_line = get_lemmatized_line(tagged_tok)
            stem_line = [stemmer.stem(t) for t in tokens_clean]
            synonyms, hypernyms, hyponyms, meronyms, holonymns = get_semantic_features(
                tagged_tok, tokens)
            print(dir_file + f, ' ===> \n', 'text: \n', text, '\n',
                  'tokens: \n', tokens, '\n', 'pos tag: \n', tagged_tok, '\n',
                  'remove_stopWords: \n', tokens_clean, '\n', 'lemmatized: \n',
                  lemma_line, '\n', 'stemmed: \n', stem_line, '\n',
                  'synonyms: \n', synonyms, '\n', 'hypernyms: \n', hypernyms,
                  '\n', 'hyponyms: \n', hyponyms, '\n', 'meronyms: \n',
                  meronyms, '\n', 'holonymns: \n', holonymns, '\n\n')
def fix(corpus_ud, corpus_itb, corpus_out):
    itb = map(
        lambda tagged: map(str2tuple, tagged.split()),
        io.open(corpus_itb, 'r', encoding='utf-8').read().strip().split('\n'))
    ud = map(
        lambda tagged: map(str2tuple, tagged.split()),
        io.open(corpus_ud, 'r', encoding='utf-8').read().strip().split('\n'))

    corpus = []
    for x, y in zip(ud, itb):
        sent = []
        for xx, yy in zip(x, y):
            xx, yy = list(xx), list(yy)
            if xx[1] == 'PROPN':
                yy[1] = 'E--'
            if 'X--' in yy[1] or 'F--' in yy[1]:
                if xx[1] == 'NOUN':
                    yy[1] = yy[1].replace('X--', 'NSD').replace('F--', 'NSD')
                elif xx[1] == 'VERB':
                    yy[1] = yy[1].replace('X--', 'VSA').replace('F--', 'NSD')
                elif xx[1] == 'ADJ':
                    yy[1] = yy[1].replace('X--', 'ASP').replace('F--', 'ASP')
                elif xx[1] == 'ADV':
                    yy[1] = yy[1].replace('X--', 'D--').replace('F--', 'D--')
                elif xx[1] == 'ADP':
                    yy[1] = yy[1].replace('X--', 'R--').replace('F--', 'R--')
                elif xx[1] == 'DET':
                    yy[1] = yy[1].replace('X--', 'B--').replace('F--', 'B--')
            sent.append(tuple2str(yy))
        corpus.append(sent)

    with io.open(corpus_out, 'w', encoding='utf-8') as out:
        for sent in corpus:
            out.write(' '.join(sent))
            out.write('\n')
Example #5
    def pos_tag_raw_text(self, text, as_tuple_list=True):
        # Unfortunately, for the moment there is no method in nltk.parse.corenlp that does sentence splitting + POS tagging in one call.
        # Only raw_tag_sents is available, but it expects a list of str (i.e. it assumes the sentences are already split).
        # We define a small custom function, heavily inspired by raw_tag_sents, that does both.

        def raw_tag_text():
            """
            Perform tokenizing sentence splitting and PosTagging and keep the 
            sentence splits structure
            """
            properties = {'annotators': 'tokenize,ssplit,pos'}
            tagged_data = self.parser.api_call(text, properties=properties)
            for tagged_sentence in tagged_data['sentences']:
                yield [(token['word'], token['pos'])
                       for token in tagged_sentence['tokens']]

        tagged_text = list(raw_tag_text())

        if as_tuple_list:
            return tagged_text
        return '[ENDSENT]'.join([
            ' '.join([
                tuple2str(tagged_token, self.separator)
                for tagged_token in sent
            ]) for sent in tagged_text
        ])
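For reference, a minimal standalone sketch (not part of the original class) of the flat string this method returns when as_tuple_list=False, assuming a hypothetical two-sentence result and '/' standing in for self.separator:

from nltk.tag.util import tuple2str

# Hypothetical result of raw_tag_text() for two sentences.
tagged_text = [[('The', 'DT'), ('cat', 'NN'), ('sleeps', 'VBZ')],
               [('It', 'PRP'), ('purrs', 'VBZ')]]
separator = '/'  # stands in for self.separator

flat = '[ENDSENT]'.join(
    ' '.join(tuple2str(token, separator) for token in sent)
    for sent in tagged_text)
print(flat)  # The/DT cat/NN sleeps/VBZ[ENDSENT]It/PRP purrs/VBZ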
def fix(corpus_ud, corpus_itb, corpus_out):
    itb = map(lambda tagged: map(str2tuple, tagged.split()),
              codecs.open(corpus_itb, 'r', 'utf-8').read().strip().split('\n'))
    ud = map(lambda tagged: map(str2tuple, tagged.split()),
             codecs.open(corpus_ud, 'r', 'utf-8').read().strip().split('\n'))

    corpus = []
    for x, y in zip(ud, itb):
        sent = []
        for xx, yy in zip(x, y):
            xx, yy = list(xx), list(yy)
            if xx[1] == 'PROPN':
                yy[1] = 'E--'
            if 'X--' in yy[1] or 'F--' in yy[1]:
                if xx[1] == 'NOUN':
                    yy[1] = yy[1].replace('X--', 'NSD').replace('F--', 'NSD')
                elif xx[1] == 'VERB':
                    yy[1] = yy[1].replace('X--', 'VSA').replace('F--', 'NSD')
                elif xx[1] == 'ADJ':
                    yy[1] = yy[1].replace('X--', 'ASP').replace('F--', 'ASP')
                elif xx[1] == 'ADV':
                    yy[1] = yy[1].replace('X--', 'D--').replace('F--', 'D--')
                elif xx[1] == 'ADP':
                    yy[1] = yy[1].replace('X--', 'R--').replace('F--', 'R--')
                elif xx[1] == 'DET':
                    yy[1] = yy[1].replace('X--', 'B--').replace('F--', 'B--')
            sent.append(tuple2str(yy))
        corpus.append(sent)

    with codecs.open(corpus_out, 'w', 'utf-8') as out:
        for sent in corpus:
            out.write(' '.join(sent))
            out.write('\n')
Example #7
def tokenize(file_path):
    '''
    Helper function to preprocess and tokenize articles.

    1) Lowercase
    2) Tokenize using Punkt tokenizer
    3) Part-of-speech tag using Averaged Perceptron tagger
    4) Lemmatize using WordNet lemmatizer
    5) Filter out stopwords

    Parameters
    ----------
    file_path : string
        Path to article to be tokenized.
    '''
    tokens = []
    stopwords = set(nltk.corpus.stopwords.words('english'))
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

    with open(file_path, 'r') as f:
        data = f.read().lower()

    for sent in nltk.sent_tokenize(data):
        for word in nltk.word_tokenize(sent):
            tokens += [word]

    tagged_tokens = nltk.pos_tag(tokens)

    if len(tagged_tokens) <= 100:
        # Assume that this document is from corpus 2
        # Strip proper nouns and cardinal numbers
        tokens = [
            tuple2str((lemmatizer.lemmatize(token, penn_to_wordnet(tag)), tag))
            for token, tag in tagged_tokens
            if (token not in stopwords and tag not in ['NNP', 'NNPS', 'CD'])
        ]
    else:
        tokens = [
            tuple2str((lemmatizer.lemmatize(token, penn_to_wordnet(tag)), tag))
            for token, tag in tagged_tokens if token not in stopwords
        ]

    return tokens
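A minimal usage sketch for the function above; the article path and the printed values are hypothetical, and penn_to_wordnet plus the required nltk data are assumed to be available as in the original module:

tokens = tokenize('data/sample_article.txt')  # hypothetical path
print(tokens[:3])  # e.g. ['market/NN', 'rise/VB', 'sharply/RB']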
    def pos_tag_raw_text(self, text, as_tuple_list=True):
        """
        Implementation of abstract method from PosTagging
        @see PosTagging
        """
        tagged_text = self.tagger.tag_sents([self.sent_tokenizer.sentences_from_text(text)])

        if as_tuple_list:
            return tagged_text
        return '[ENDSENT]'.join(
            [' '.join([tuple2str(tagged_token, self.separator) for tagged_token in sent]) for sent in tagged_text])
	def chunked_sent_string(self, sent):
		parts = []
		
		for word, tag in sent:
			try:
				brack = word in u'[]'
			except:
				brack = False
			
			if brack:
				# brackets don't get a tag
				parts.append(word)
			else:
				# make sure no brackets or slashes in tag
				tag = tag.replace(u'[', u'(').replace(u']', u')').replace(u'/', '|')
				parts.append(tuple2str((word, tag)))
		
		return ' '.join(parts)
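A small standalone sketch of the sanitizing behaviour above; the sample sentence is made up, and only tuple2str from nltk.tag.util is assumed:

from nltk.tag.util import tuple2str

sent = [('[', 'NIL'), ('time', 'NN'), ('flies', 'VBZ'), (']', 'NIL'), ('fast', 'RB/ADV')]
parts = []
for word, tag in sent:
    if word in u'[]':
        parts.append(word)  # brackets are emitted without a tag
    else:
        # remove characters that would clash with the word/tag notation
        tag = tag.replace(u'[', u'(').replace(u']', u')').replace(u'/', '|')
        parts.append(tuple2str((word, tag)))
print(' '.join(parts))  # [ time/NN flies/VBZ ] fast/RB|ADV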
def createIndex(instance_url='http://localhost:8983/solr/collection_1/',
                dir_file="./training/"):
    #def indexer( dir_file = "./training/"):
    '''
    Load the Solr instance and index the files under dir_file.
    '''
    solr = pysolr.Solr(instance_url)
    files = listdir(dir_file)
    data = []
    stemmer = PorterStemmer()
    file_id = 0
    for f in files:
        file_id += 1
        with open(dir_file + f, 'r', encoding='ISO-8859-1') as text_file:
            text = text_file.read()
            text = text.strip().lower()
            tokens = word_tokenize(text)
            #            head_word = get_dependency_relations(text)
            tagged_tok = pos_tag(tokens)
            tagged_list = [tuple2str(t) for t in tagged_tok]
            tokens_clean = deleteStopWords(tokens)
            lemma_line = get_lemmatized_line(tagged_tok)
            stem_line = [stemmer.stem(t) for t in tokens_clean]
            synonyms, hypernyms, hyponyms, meronyms, holonymns = get_semantic_features(
                tagged_tok, tokens)
            data.append({
                'id': f + '_' + str(file_id),
                'text': ' '.join(tokens),
                'pos_tag': ' '.join(tagged_list),
                'text_clean': ' '.join(tokens_clean),
                'lemmas': lemma_line,
                'stems': ' '.join(stem_line),
                'synonyms': synonyms,
                'hypernyms': hypernyms,
                'hyponyms': hyponyms,
                'meronyms': meronyms,
                'holonymns': holonymns,
                #                'head_word': head_word,
            })


    # print(data)
    if data:
        solr.add(data)
    def chunked_sent_string(self, sent):
        parts = []

        for word, tag in sent:
            try:
                brack = word in u'[]'
            except:
                brack = False

            if brack:
                # brackets don't get a tag
                parts.append(word)
            else:
                # make sure no brackets or slashes in tag
                tag = tag.replace(u'[', u'(').replace(u']',
                                                      u')').replace(u'/', '|')
                parts.append(tuple2str((word, tag)))

        return ' '.join(parts)
    def pos_tag_raw_text(self, text, as_tuple_list=True):
        # Unfortunately, for the moment there is no method in nltk.parse.corenlp that does sentence splitting + POS tagging in one call.
        # Only raw_tag_sents is available, but it expects a list of str (i.e. it assumes the sentences are already split).
        # We define a small custom function, heavily inspired by raw_tag_sents, that does both.

        parsed_text = self.parser(text)
        sentences = parsed_text.sentences

        def raw_tag_text():
            """
            Perform tokenizing sentence splitting and PosTagging and keep the 
            sentence splits structure
            """
            for tagged_sentence in sentences:
                yield [(token.text, token.xpos) 
                       for token in tagged_sentence.words]
        
        tagged_text = list(raw_tag_text())        

        if as_tuple_list:
            return tagged_text, sentences
        return '[ENDSENT]'.join(
            [' '.join([tuple2str(tagged_token, self.separator) for tagged_token in sent]) for sent in tagged_text]), sentences
Example #13
def match(query, method, instance_url):
    '''
    Search the Solr index using the user query.
    '''
    q_list = []
    q_string = ''
    solr = pysolr.Solr(instance_url)

    tokens = word_tokenize(query)
    stemmer = PorterStemmer()
    tagged_tokens = pos_tag(tokens)
    tagged_list = [tuple2str(t) for t in tagged_tokens]
    tokens_clean = deleteStopWords(tokens)
    lemmas = get_lemmatized_line(tagged_tokens)
    stem_line = [stemmer.stem(t) for t in tokens_clean]
    synonyms, hypernyms, hyponyms, meronyms, holonyms = get_semantic_features(
        tagged_tokens, tokens)
    #        head_words = get_dependency_relations(lemmas, True)
    print('user input features', ' ===> \n', 'text: \n', query, '\n',
          'tokens: \n', tokens, '\n', 'pos tag: \n', tagged_tokens, '\n',
          'remove_stopWords: \n', tokens_clean, '\n', 'lemmatized: \n', lemmas,
          '\n', 'stemmed: \n', stem_line, '\n', 'synonyms: \n', synonyms, '\n',
          'hypernyms: \n', hypernyms, '\n', 'hyponyms: \n', hyponyms, '\n',
          'meronyms: \n', meronyms, '\n', 'holonymns: \n', holonyms, '\n\n')

    pos_tag_data = '&'.join(tagged_list)
    lemmas = '&'.join(lemmas.split())
    stems = '&'.join(stem_line)
    synonyms = '&'.join(synonyms.split())
    hypernyms = '&'.join(hypernyms.split())
    hyponyms = '&'.join(hyponyms.split())
    holonyms = '&'.join(holonyms.split())
    meronyms = '&'.join(meronyms.split())

    if method == 3:
        #        head_words = '&'.join(head_words.split())
        if tokens:
            q_list.append('text:' + '&'.join(tokens))
        if tokens_clean:
            q_list.append('text_clean:' + '&'.join(tokens_clean))
        if pos_tag_data:
            q_list.append('pos_tag:' + pos_tag_data)
        if lemmas:
            q_list.append('lemmas:' + lemmas)
        if stems:
            q_list.append('stems:' + stems)
        if synonyms:
            q_list.append('synonyms:' + synonyms)
        if hypernyms:
            q_list.append('hypernyms:' + hypernyms)
        if hyponyms:
            q_list.append('hyponyms:' + hyponyms)
        if meronyms:
            q_list.append('meronyms:' + meronyms)
        if holonyms:
            q_list.append('holonymns:' + holonyms)
#            if head_words:
#                q_list.append('head_word:' + head_words)
    if method == 4:
        if tokens:
            q_list.append('text:' + '&'.join(tokens) + '^0.5')
        if pos_tag_data:
            q_list.append('pos_tag:' + pos_tag_data + '^0.02')
        if lemmas:
            q_list.append('lemmas:' + lemmas + '^4')
        if tokens_clean:
            q_list.append('text_clean:' + '&'.join(tokens_clean) + '^5')
        if stems:
            q_list.append('stems:' + stems + '^1.5')
        if synonyms:
            q_list.append('synonyms:' + synonyms + '^5')
        if hypernyms:
            q_list.append('hypernyms:' + hypernyms + '^5')


#       if hyponyms:
#                 q_list.append('hyponyms:' + hyponyms + '^4')
#         if head_words:
#                q_list.append('head_word:' + head_words + '^0.5')
#         if meronyms:
#                 q_list.append('meronyms:' + meronyms + '^1.4')
#         if holonyms:
#                 q_list.append('holonymns:' + holonyms + '^1.4')

    q_string = ', '.join(q_list)
    print('The Solr query is q=%s, fl=\'id,text\'\n' % (q_string))
    result = solr.search(q=q_string, fl='id,text')
    for r in result:
        print(r['id'])
        print(' '.join(r['text']))
Example #14
from nltk.tag.util import tuple2str

taggedtok = ('bear', 'NN')
print(tuple2str(taggedtok))  # bear/NN
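tuple2str and str2tuple are inverses, and both take an optional separator; a quick round-trip check (note that str2tuple upper-cases the tag it parses):

from nltk.tag.util import str2tuple, tuple2str

assert str2tuple('bear/NN') == ('bear', 'NN')
assert tuple2str(('bear', 'NN'), '_') == 'bear_NN'
assert str2tuple(tuple2str(('bear', 'NN'))) == ('bear', 'NN')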
def test(test_file, output_file, transition_probabilities,
         emission_probabilities):
    """
    Performs part-of-speech tagging on each word in a sentence in the test file using Viterbi algorithm.

    Probabilities are calculated on a logarithmic scale to avoid problems associated with multiplying small floating
    point numbers.

    For cases where emission probabilities of a token given a tag were not encountered in training data, the emission
    probability is treated as 0. As probabilities are being calculated on a logarithmic scale, and log(0) is undefined,
    log(float_info.min) is used as a substitute, representing the logarithmic value as it approaches 0.

    Side effects of substituting "0" probabilities with log(float_info.min):
    - in cases where there are no unknown tokens (defined as tokens not encountered in the tagged training corpus),
    log(float_info.min) serves as a heavy penalty to a path's probability where a "0" probability occurs, relative to
    other paths without a "0" probability. When considering the best paths for a token's tag types, a path with
    log(float_info.min) in its probability will effectively fare poorer in comparisons with paths without such "0"
    probabilities, as log(float_info.min) will be several magnitudes smaller than the smallest probabilities.
    - in cases where there are unknown tokens, every emission probability related to that token given a tag type will be
    "0" (i.e. log(float_info.min)). As path probabilities are being calculated on a logarithmic scale, probabilities are
    being added and not multiplied, thus encountering a single unknown token in a sequence will not render all paths in
    the trellis 0. Instead, log(float_info.min) would be a flat penalty across all paths, thus effectively cancelling
    each other out in comparisons. In which case where unknown tokens are encountered, emission probabilities of that
    token are not considered. Rather, only the tag types from the previous token, and the transition probability between
    tag types play a part in determining the best paths.

    Viterbi nodes are initialized to -float_info.max such that in comparisons to determine the best paths, it will be
    replaced by even the worst of path probabilities, as -float_info.max is a googol googol googol (that's 300 zeroes)
    magnitudes smaller than log(float_info.min). Even if a sentence contains several unknown tokens, the resulting path
    probabilities would still be larger than -float_info.max.

    :param test_file: the file containing sentences to perform part-of-speech tagging on
    :param output_file: the file to write the part-of-speech tagged sentences to
    :param transition_probabilities: transition probabilities of a trained hidden Markov model, on a logarithmic scale
    :param emission_probabilities: emission probabilities of a trained hidden Markov model, on a logarithmic scale
    """
    with open(test_file, "r") as test_data, open(output_file, "w") as output:
        for line in tqdm(test_data, total=rawcount(test_file),
                         desc="Testing "):

            tokens = tuple(line.split())
            tag_types = (list(load('help/tagsets/upenn_tagset.pickle').keys())
                         + ["-LRB-", "-RRB-", "#"])
            # The tagset in nltk uses different notations
            tag_types = [x for x in tag_types if x not in ["(", ")", "--"]]

            # Initialize required arrays to model the Viterbi trellis for given test input.
            # The viterbi array keeps track of the best probability path to a token's tag type from the previous token.
            # For each best path, the backpointer array keeps track of the tag type in the previous token.
            viterbi = np.full((len(tokens), len(tag_types)), -float_info.max)
            backpointer = np.full((len(tokens), len(tag_types)), -1, dtype=int)

            # Initialize paths in trellis from start to tag types (states) corresponding to first token (observation)
            for t_index, tag in enumerate(tag_types):
                viterbi[0][t_index] = transition_probabilities[("START", tag)] \
                                      + emission_probabilities.get((tokens[0], tag), log(float_info.min))

            # Iteratively fill out Viterbi path probabilities between tag types of each token and the tag types of the
            # token immediately preceding it in the sequence
            if len(tokens) > 1:
                for token_index, (prev_token,
                                  curr_token) in enumerate(pairwise(tokens)):
                    for ctag_index, curr_tag in enumerate(tag_types):
                        for ptag_index, prev_tag in enumerate(tag_types):
                            temp_viterbi = viterbi[token_index][ptag_index] \
                                          + transition_probabilities[(prev_tag, curr_tag)] \
                                          + emission_probabilities.get((curr_token, curr_tag), log(float_info.min))

                            if temp_viterbi >= viterbi[token_index + 1][ctag_index]:
                                viterbi[token_index + 1][ctag_index] = temp_viterbi
                                backpointer[token_index + 1][ctag_index] = ptag_index

            # Determine the best terminating path from the last token
            last_token_index = len(tokens) - 1
            end_viterbi = -float_info.max
            end_backpointer = -1
            for tag_index, prev_tag in enumerate(tag_types):
                temp_viterbi = viterbi[last_token_index][tag_index] \
                               + transition_probabilities[(prev_tag, "END")]

                if temp_viterbi >= end_viterbi:
                    end_viterbi = temp_viterbi
                    end_backpointer = tag_index

            # Perform Viterbi backtrace, finding the most likely tag type sequence through best paths to the beginning
            likeliest_tag_indexes = [-1] * len(tokens)
            likeliest_tag_indexes[-1] = end_backpointer
            for token_index in reversed(range(len(tokens) - 1)):
                likeliest_tag_indexes[token_index] = backpointer[
                    token_index + 1][likeliest_tag_indexes[token_index + 1]]

            # Formatting output
            likeliest_tags = [
                tag_types[index] for index in likeliest_tag_indexes
            ]
            pos_tagged_line = ' '.join([
                tuple2str(tagged_token)
                for tagged_token in list(zip(tokens, likeliest_tags))
            ])

            output.write(pos_tagged_line + "\n")
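A quick numeric check of the docstring's claims about the log(float_info.min) floor versus the -float_info.max initializer (plain standard-library Python, independent of the tagger above):

from math import log
from sys import float_info

floor = log(float_info.min)       # about -708.4, the stand-in for log(0)
print(floor, -float_info.max)     # roughly -708.4 vs -1.8e+308

# Even a path that pays the unknown-token penalty many times over stays far
# above the trellis initializer, so any real path wins the max comparison.
print(1000 * floor > -float_info.max)  # True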
Example #16
def searchintask4_default(query):
    solrInstance = 'http://localhost:8983/solr/task3/'
    solr = pysolr.Solr(solrInstance)
    stemmer = PorterStemmer()
    tokensTaggedTest = pos_tag(query)
    head_words = dependencyRel(' '.join(query))
    synonymsTest, hypernymsTest, hyponymsTest, meronymsTest, holonymsTest = getFeatures(
        tokensTaggedTest, query)
    lemmasTest = getLemmas(tokensTaggedTest)
    stem11 = [stemmer.stem(t) for t in query]
    stemTest = ' '.join(stem11)
    listTagged = [tuple2str(t) for t in tokensTaggedTest]

    posData = ' '.join(listTagged)
    posData = '&'.join(posData.split())
    lemmasTest = '&'.join(lemmasTest.split())
    stemTest = '&'.join(stemTest.split())
    synonymsTest = '&'.join(synonymsTest.split())
    hypernymsTest = '&'.join(hypernymsTest.split())
    hyponymsTest = '&'.join(hyponymsTest.split())
    holonymsTest = '&'.join(holonymsTest.split())
    meronymsTest = '&'.join(meronymsTest.split())
    head_words = '&'.join(head_words.split())
    q_list = []
    if query:
        q_list.append('faq:' + ' '.join(query) + '^1.8')
        q_list.append('faq_ans:' + ' '.join(query) + '^0.8')
    if posData:
        q_list.append('pos_tag_q:' + posData + '^0.02')
        q_list.append('pos_tag_a:' + posData + '^0.001')
    if lemmasTest:
        q_list.append('lemma_q:' + lemmasTest + '^2.0')
        q_list.append('lemma_a:' + lemmasTest + '^1.0')
    # if stemTest:
    #     q_list.append('stem:' + stemTest + '^1.5')
    if synonymsTest:
        q_list.append('synonyms_q:' + synonymsTest + '^3.0')
        q_list.append('synonyms_a:' + synonymsTest + '^1.5')
    if hypernymsTest:
        q_list.append('hypernyms_q:' + hypernymsTest + '^4.0')
        q_list.append('hypernyms_a:' + hypernymsTest + '^3.5')
    if head_words:
        q_list.append('head_words_q:' + head_words + '^3.0')
        q_list.append('head_words_a:' + head_words + '^2.0')
    if hyponymsTest:
        q_list.append('hyponyms_q:' + hyponymsTest + '^0.24')
        q_list.append('hyponyms_a:' + hyponymsTest + '^0.14')
    if meronymsTest:
        q_list.append('meronyms_q:' + meronymsTest + '^0.14')
        q_list.append('meronyms_a:' + meronymsTest + '^0.10')
    if holonymsTest:
        q_list.append('holonyms_q:' + holonymsTest + '^0.14')
        q_list.append('holonyms_a:' + holonymsTest + '^0.10')
    # print(','.join(q_list))
    q_string = ', '.join(q_list)
    print("Query is: ")
    print("q=" + q_string + ", fl='*, score', rows=" + str(10))
    input("Press Enter to continue...")

    result = solr.search(q=q_string, fl='*, score', rows=10)
    # for r in json.dumps(result.docs):
    #     print(r)
    # for r in result:
    #     print(r['id'], r['text'])
    #     # print(r['text'])
    print()
    print("------------------")
    print("| SEARCH RESULTS |")
    print("------------------")
    print()
    print("Saw {0} result(s).".format(len(result)))
    j = 0
    top_10 = []
    for result1 in result:
        j += 1
        print(j)
        # print(result1)
        temp = result1['id']
        art = str(temp).split("_")
        # print("Article : " + art[0])
        # print("Sentence : " + art[1])
        sen = result1['faq']
        print(sen)
        top_10.append(sen[0])
        print(result1['score'])
        print("-----------------------")
    print(str(top_10[0]))
    return top_10
Example #17
def tag_words_string(tagged):
	return ' '.join([tuple2str(i) for i in tagged])
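A one-line usage example for the helper above:

print(tag_words_string([('bear', 'NN'), ('sleeps', 'VBZ')]))  # bear/NN sleeps/VBZ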
Example #18
def indexingProcessTask3(csv_data):
    solrInstance = 'http://localhost:8983/solr/task3/'
    start = time.time()
    solr = pysolr.Solr(solrInstance)
    stemmer = PorterStemmer()
    data = []
    questionid = 1
    for row in csv_data:  # renamed from "tuple" to avoid shadowing the built-in
        tokensque = word_tokenize(row[0].strip())
        stopRemovedQue = removestop(tokensque)
        tokensans = word_tokenize(row[1].strip())
        stopRemovedans = removestop(tokensans)
        postaggedque = pos_tag(tokensque)
        postaggedans = pos_tag(tokensans)
        list_pos_tagged_que = [tuple2str(t) for t in postaggedque]
        list_pos_tagged_ans = [tuple2str(t) for t in postaggedans]
        lemma1 = getLemmas(postaggedque)
        head_words_que = dependencyRel(row[0].strip())
        head_words_ans = dependencyRel(row[1].strip())
        extra_heads_que = set()
        for x in head_words_que.split():
            if x in custom_syn_map:
                extra_heads_que.add(custom_syn_map[x])
        for x in extra_heads_que:
            head_words_que += " " + x

        stem1 = [stemmer.stem(t) for t in tokensque]
        lemma2 = getLemmas(postaggedans)
        stem2 = [stemmer.stem(t) for t in tokensans]
        synonyms, hypernyms, hyponyms, meronyms, holonyms = getFeatures(
            postaggedque, tokensque)
        synonyms_a, hypernyms_a, hyponyms_a, meronyms_a, holonyms_a = getFeatures(
            postaggedans, tokensans)
        data.append({
            'id': str(questionid),
            'faq_original': str(row[0]),
            'faq': ' '.join(tokensque),
            'stop_words_q': ' '.join(stopRemovedQue),
            'pos_tag_q': ' '.join(list_pos_tagged_que),
            'lemma_q': lemma1,
            'stem_q': ' '.join(stem1),
            'head_words_q': head_words_que,
            'synonyms_q': synonyms,
            'hypernyms_q': hypernyms,
            'hyponyms_q': hyponyms,
            'meronyms_q': meronyms,
            'holonyms_q': holonyms,
            'faq_ans': ' '.join(tokensans),
            'stop_words_a': ' '.join(stopRemovedans),
            'pos_tag_a': ' '.join(list_pos_tagged_ans),
            'lemma_a': lemma2,
            'stem_a': ' '.join(stem2),
            'head_words_a': head_words_ans,
            'synonyms_a': synonyms_a,
            'hypernyms_a': hypernyms_a,
            'hyponyms_a': hyponyms_a,
            'meronyms_a': meronyms_a,
            'holonyms_a': holonyms_a,
        })
        questionid += 1
    solr.add(data)