Example 1
 def __init__(self, k=1):
     # phrase detector
     self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
     # number converter
     self.tokenizer = RawTokenizer()
     # build model
     self.model = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
     # parameters
     self.p = 0.8
     self.k = k
Example 2
 def __init__(self, alpha=6.0):
     # phrase detector
     self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
     # number converter
     self.tokenizer = RawTokenizer()
     # build model
     self.w2v = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
     self.tfidf = TFIDFmodel()
     # parameters
     self.alpha = alpha
     self.k = 3
     self.p = 0.80
Example 3
class TermWindowW2VExpansion(QueryExpansion):

    def __init__(self, k=1):
        # phrase detector
        self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
        # number converter
        self.tokenizer = RawTokenizer()
        # build model
        self.model = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
        # parameters
        self.p = 0.8
        self.k = k

    def expand(self, query):
        phrases = self.phrase_detector.detect(self.tokenizer.tokenize(query.lower()))
        w2v_phrases = [phrase for phrase in phrases if phrase in self.model]
        translated_queries = [[] for i in range(self.k)]
        for idx, phrase in enumerate(w2v_phrases):
            # get adjacent terms
            prev_phrase = w2v_phrases[idx-1] if idx != 0 else u""
            next_phrase = w2v_phrases[idx+1] if idx != len(w2v_phrases)-1 else u""
            window = [e for e in [prev_phrase, phrase, next_phrase] if len(e) > 0]
            similar_phrases = self.model.inner_model.most_similar(window, topn=self.k)
            for i in range(self.k):
                translated_queries[i].append(similar_phrases[i][0])
        query_strings = [" ".join(q) for q in translated_queries]

        combined_query = query + "." + ".".join(query_strings)
        return combined_query

    def __str__(self):
        return self.__class__.__name__
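A minimal usage sketch for this class; the import path and the query string are hypothetical, not taken from the original repository:

# Hypothetical usage sketch: the module path and the query are illustrative.
from queryexpansion import TermWindowW2VExpansion  # hypothetical path

expander = TermWindowW2VExpansion(k=2)
# expand() joins the k translated queries with "." so a downstream engine
# can treat them as separate sub-queries appended to the original query.
print(expander.expand("systemic lupus erythematosus with renal involvement"))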
Example 4
class TermwiseW2VExpansion(QueryExpansion):

    def __init__(self, k=1):
        # phrase detector
        self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
        # number converter
        self.tokenizer = RawTokenizer()
        # build model
        self.model = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
        # parameters
        self.p = 0.7
        self.k = k

    def expand(self, query):
        phrases = self.phrase_detector.detect(self.tokenizer.tokenize(query.lower()))
        w2v_phrases = [phrase for phrase in phrases if phrase in self.model]
        translated_queries = [[] for i in range(self.k)]
        for phrase in w2v_phrases:
            similar_phrases = self.model.inner_model.most_similar_cosmul(phrase, topn=self.k)
            for i in range(self.k):
                translated_queries[i].append(similar_phrases[i][0])
        # a fresh name here avoids shadowing the `query` parameter
        query_strings = [" ".join(q) for q in translated_queries]
        combined_query = ".".join(query_strings)
        return combined_query

    def __str__(self):
        return self.__class__.__name__
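This class differs from TermWindowW2VExpansion mainly in the similarity call: most_similar_cosmul uses Levy and Goldberg's multiplicative combination rather than additive cosine similarity, so no single strong neighbour dominates the ranking. A standalone comparison of the two gensim calls on a toy corpus (the sentences and the gensim 4.x parameter names are assumptions):

# Toy comparison of the two gensim similarity methods used by these
# expansion classes; the three-sentence corpus is purely illustrative.
from gensim.models import Word2Vec

sentences = [["fever", "cough", "influenza"],
             ["fever", "rash", "measles"],
             ["cough", "wheezing", "asthma"]]
model = Word2Vec(sentences, vector_size=10, min_count=1, seed=1)

print(model.wv.most_similar("fever", topn=2))         # additive cosine
print(model.wv.most_similar_cosmul("fever", topn=2))  # multiplicative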
Example 5
class WeightedW2VExpansion(QueryExpansion):
    def __init__(self, alpha=6.0):
        # phrase detector
        self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
        # number converter
        self.tokenizer = RawTokenizer()
        # build model
        self.w2v = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
        self.tfidf = TFIDFmodel()
        # parameters
        self.alpha = alpha
        self.k = 3
        self.p = 0.80

    def expand(self, query):
        phrases = self.phrase_detector.detect(
            self.tokenizer.tokenize(query.lower()))
        w2v_phrases = [phrase for phrase in phrases if phrase in self.w2v]
        extra_terms = []
        for phrase in w2v_phrases:
            idf = 0.0
            if phrase in self.tfidf.dictionary.token2id:
                idf = self.tfidf.inner_model.idfs[
                    self.tfidf.dictionary.token2id[phrase]]
            expansion = []
            if idf > self.alpha:
                expansion = self.w2v.inner_model.most_similar_cosmul(
                    positive=[phrase], topn=self.k)
            # print phrase, idf, " ".join([e[0] for e in expansion])
            extra_terms += [e[0] for e in expansion]
        new_query = query + " " + " ".join(extra_terms)
        return new_query

    def __str__(self):
        return self.__class__.__name__
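Only phrases whose inverse document frequency exceeds alpha get expanded, so frequent, low-information phrases contribute no extra terms. A sketch of that gating rule in isolation, with hypothetical stand-ins for the TFIDFmodel and W2Vmodel lookups:

# Hypothetical helpers standing in for the TFIDFmodel / W2Vmodel lookups.
def expansion_terms(phrases, idf_of, similar_to, alpha=6.0, k=3):
    """Return expansion terms only for rare (high-IDF) phrases."""
    extra = []
    for phrase in phrases:
        if idf_of(phrase) > alpha:          # skip common phrases
            extra += similar_to(phrase, k)  # top-k w2v neighbours
    return extra

# Toy run with inline stand-ins:
print(expansion_terms(
    ["patient", "erythema_nodosum"],
    idf_of=lambda p: {"patient": 1.2, "erythema_nodosum": 8.5}[p],
    similar_to=lambda p, k: ["sarcoidosis", "panniculitis"][:k]))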
Example 6
class AverageW2VExpansion(QueryExpansion):
    def __init__(self, p=0.7):
        # phrase detector
        self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
        # number converter
        self.tokenizer = RawTokenizer()
        # build model
        self.model = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
        # parameters
        self.p = p
        self.n = 10

    def expand(self, query):
        phrases = self.phrase_detector.detect(
            self.tokenizer.tokenize(query.lower()))
        w2v_phrases = [phrase for phrase in phrases if phrase in self.model]
        similar_phrases = self.model.inner_model.most_similar(w2v_phrases, [],
                                                              topn=self.n)
        extra_terms = " ".join([
            phrase[0].replace('_', ' ') for phrase in similar_phrases
            if phrase[1] > self.p
        ])
        return "%s %s" % (
            query,
            extra_terms,
        )

    def __str__(self):
        return self.__class__.__name__
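Passing the whole phrase list as the positive argument makes gensim average the phrase vectors before the neighbour search, so the expansion reflects the query as a whole rather than individual terms; neighbours scoring at or below p are dropped. A hypothetical usage sketch (import path and query are assumptions):

# Hypothetical usage; the import path and query text are illustrative.
from queryexpansion import AverageW2VExpansion  # hypothetical path

expander = AverageW2VExpansion(p=0.75)
# A stricter threshold p keeps only the closest neighbours, giving a
# shorter, more conservative expansion.
print(expander.expand("acute myeloid leukemia relapse"))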
Example 7
 def __init__(self, size=50, modelfile=None):
     self.phrase_detector = PmiPhraseDetector(
         RawSentenceStream(fz_docs=False))
     # build model
     epochs = 2
     self.model = D2Vmodel(
         PhraseSentenceStream(self.phrase_detector,
                              extract_func=extract_docid,
                              fz_docs=True,
                              reshuffles=epochs - 1),
         name="DOCID",
         dataset_name="CASEREPORT",
         epochs=epochs,
         dimension=size,
         modelfile=modelfile,
     )
     self.doc_index = DocIndex(CaseReportLibrary(), "CASEREPORT")
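D2Vmodel itself is not shown; given the extract_docid function, each document presumably becomes one paragraph vector tagged with its document id. A minimal standalone equivalent using gensim's Doc2Vec directly (the toy documents and the gensim 4.x API names are assumptions):

# Minimal standalone sketch of docid-tagged paragraph vectors with gensim;
# the two toy documents are purely illustrative.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = [TaggedDocument(["fever", "rash", "photosensitivity"], ["doc1"]),
        TaggedDocument(["cough", "dyspnea", "wheezing"], ["doc2"])]
model = Doc2Vec(docs, vector_size=50, epochs=2, min_count=1)

print(model.dv["doc1"])                       # learned vector for a document id
print(model.dv.most_similar("doc1", topn=1))  # nearest other document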
Example 8
__author__ = 'matias'

from textanalysis.phrasedetection import PmiPhraseDetector
from textanalysis.texts import RawSentenceStream
import logging

# setup logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

phrase_detector = PmiPhraseDetector(RawSentenceStream())

phrase_detector.print_phrases()
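PmiPhraseDetector's internals are not shown here; a rough standalone analogue built on gensim's Phrases with NPMI scoring (the corpus and threshold are illustrative assumptions, not the original implementation) would be:

# Rough standalone analogue of PMI-based phrase detection using gensim's
# Phrases; the corpus and the NPMI threshold are illustrative assumptions.
from gensim.models.phrases import Phrases

sentences = [["systemic", "lupus", "erythematosus"],
             ["systemic", "lupus", "nephritis"],
             ["acute", "renal", "failure"]] * 10
phrases = Phrases(sentences, min_count=2, threshold=0.3, scoring="npmi")

# Token pairs that co-occur above the threshold are merged with "_".
print(phrases[["systemic", "lupus", "flare"]])  # e.g. ['systemic_lupus', 'flare']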
Example 9
from irmodels.D2Vmodel import D2Vmodel, DocIndex
from textanalysis.texts import PhraseSentenceStream, RawSentenceStream, extract_docid, extract_mesh_terms
from textanalysis.phrasedetection import PmiPhraseDetector
import logging
from scipy.spatial.distance import cosine
from textanalysis.texts import FZArticleLibrary, CaseReportLibrary
from heapq import heappush, heappop
import numpy as np

# setup logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
# phrase detector
pmi_level = 90
phrase_detector = PmiPhraseDetector(RawSentenceStream(fz_docs=False),
                                    filename=str("PHRASE_%s_2_CASEREPORT_RAW" %
                                                 (pmi_level, )))

# build model
epochs = 2
m = D2Vmodel(PhraseSentenceStream(phrase_detector,
                                  extract_func=extract_docid,
                                  fz_docs=False,
                                  reshuffles=epochs - 1),
             name="DOCID",
             dataset_name="CASEREPORT",
             epochs=epochs,
             dimension=40)

doc_index = DocIndex(CaseReportLibrary(), "CASEREPORT")
"""
Example 10
__author__ = 'matias'

from irmodels.D2Vmodel import D2Vmodel
from textanalysis.texts import PhraseSentenceStream, RawSentenceStream, ExtractDiseases
from textanalysis.phrasedetection import PmiPhraseDetector
from scipy.spatial.distance import cosine
from heapq import heappush, heappop
import numpy as np
import logging

# setup logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
# phrase detector
phrase_detector = PmiPhraseDetector(RawSentenceStream(fz_docs=False))

extract_disease = ExtractDiseases()

# build model
epochs = 3
m = D2Vmodel(PhraseSentenceStream(phrase_detector,
                                  extract_func=extract_disease,
                                  fz_docs=False,
                                  reshuffles=epochs - 1),
             name="DISEASE",
             dataset_name="CASEREPORT",
             epochs=epochs)

vec_lupus = m.inner_model["man"]
print(np.all(np.isnan(vec_lupus)))
Example 11
import logging
from irmodels.W2Vmodel import W2Vmodel
from textanalysis.texts import PhraseSentenceStream, RawSentenceStream
from textanalysis.phrasedetection import PmiPhraseDetector
import numpy as np
from random import sample
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# setup logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
# phrase detector
phrase_detector = PmiPhraseDetector(RawSentenceStream())
# build model
m = W2Vmodel(PhraseSentenceStream(phrase_detector))

diseases = {}
symptoms = {}

with open("testdiseases.txt", 'r') as infile:
    for line in infile.read().split("\n")[:-1]:
        parts = line.split(",")
        diseases[parts[1]] = int(parts[0])

with open("testsymptoms.txt", 'r') as infile:
    for line in infile.read().split("\n")[:-1]:
        symptoms[line] = 1

# disease data
keywords = list(diseases.keys())  # materialize for indexing/sampling under Python 3
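The script breaks off here. Given the PCA, matplotlib, and sample imports above, one plausible continuation (hypothetical, not recovered from the original; it assumes inner_model supports membership tests, as the `phrase in self.model` checks elsewhere suggest) would project a sample of keyword vectors to 2-D:

# Hypothetical continuation inferred from the imports above: project the
# word vectors of a sample of test keywords into 2-D with PCA and plot them.
picked = [k for k in sample(keywords, min(50, len(keywords))) if k in m.inner_model]
vectors = np.array([m.inner_model[k] for k in picked])
points = PCA(n_components=2).fit_transform(vectors)

plt.scatter(points[:, 0], points[:, 1])
plt.show()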