def __init__(self, k=1):
    """Build the word2vec model used for query expansion.

    :param k: number of translated query variants to generate.
    """
    # phrase detector trained on the raw sentence stream
    self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
    # number converter
    self.tokenizer = RawTokenizer()
    # build model over phrase-detected sentences
    self.model = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
    # parameters
    self.p = 0.8  # presumably a similarity threshold — TODO confirm where it is read
    self.k = k    # number of expansion variants
def __init__(self, alpha=6.0):
    """Build the w2v + tf-idf models for idf-weighted query expansion.

    :param alpha: idf threshold — presumably only phrases with idf above
        this value get expanded; confirm against the expand() method.
    """
    # phrase detector trained on the raw sentence stream
    self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
    # number converter
    self.tokenizer = RawTokenizer()
    # build model: word2vec for neighbours, tf-idf for phrase weighting
    self.w2v = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
    self.tfidf = TFIDFmodel()
    # parameters
    self.alpha = alpha
    self.k = 3     # neighbours per expanded phrase (fixed)
    self.p = 0.80  # presumably a similarity threshold — TODO confirm usage
class TermWindowW2VExpansion(QueryExpansion):
    """Query expansion using word2vec neighbours of each phrase's context window."""

    def __init__(self, k=1):
        # phrase detector
        self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
        # number converter
        self.tokenizer = RawTokenizer()
        # build model
        self.model = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
        # parameters
        self.p = 0.8
        self.k = k

    def expand(self, query):
        """Return the query joined with self.k dot-separated translated variants."""
        tokens = self.tokenizer.tokenize(query.lower())
        known = [ph for ph in self.phrase_detector.detect(tokens)
                 if ph in self.model]
        variants = [[] for _ in range(self.k)]
        last = len(known) - 1
        for pos, current in enumerate(known):
            # neighbouring phrases; empty sentinels are dropped below
            before = known[pos - 1] if pos != 0 else u""
            after = known[pos + 1] if pos != last else u""
            window = [t for t in [before, current, after] if len(t) > 0]
            neighbours = self.model.inner_model.most_similar(window,
                                                             topn=self.k)
            for i in range(self.k):
                variants[i - 1].append(neighbours[i - 1][0])
        parts = [" ".join(v) for v in variants]
        return query + "." + ".".join(parts)

    def __str__(self):
        return self.__class__.__name__
class TermwiseW2VExpansion(QueryExpansion):
    """Expand a query by translating each in-vocabulary phrase to its
    top-k word2vec (cosmul) neighbours.

    Note: the returned string contains only the translated variants — the
    original query text is not included.
    """

    def __init__(self, k=1):
        # phrase detector
        self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
        # number converter
        self.tokenizer = RawTokenizer()
        # build model
        self.model = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
        # parameters
        self.p = 0.7  # presumably a similarity threshold — unused here; TODO confirm
        self.k = k    # number of translated query variants

    def expand(self, query):
        """Return k dot-separated translated queries built from neighbours."""
        phrases = self.phrase_detector.detect(
            self.tokenizer.tokenize(query.lower()))
        w2v_phrases = [phrase for phrase in phrases if phrase in self.model]
        translated_queries = [[] for _ in range(self.k)]
        for phrase in w2v_phrases:
            similar_phrases = self.model.inner_model.most_similar_cosmul(
                phrase, topn=self.k)
            # direct indexing: the old [i - 1] wrap-around paired identical
            # indices anyway, only in a rotated visit order
            for i in range(self.k):
                translated_queries[i].append(similar_phrases[i][0])
        # FIX: the comprehension previously rebound the name `query`,
        # shadowing the parameter; use a distinct loop variable.
        query_strings = [" ".join(terms) for terms in translated_queries]
        combined_query = ".".join(query_strings)
        return combined_query

    def __str__(self):
        return self.__class__.__name__
class WeightedW2VExpansion(QueryExpansion):
    """Expansion that only adds w2v neighbours for phrases with high idf."""

    def __init__(self, alpha=6.0):
        # phrase detector
        self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
        # number converter
        self.tokenizer = RawTokenizer()
        # build model
        self.w2v = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
        self.tfidf = TFIDFmodel()
        # parameters
        self.alpha = alpha
        self.k = 3
        self.p = 0.80

    def expand(self, query):
        """Append neighbours of rare (idf > alpha) phrases to the query."""
        tokens = self.tokenizer.tokenize(query.lower())
        candidates = [ph for ph in self.phrase_detector.detect(tokens)
                      if ph in self.w2v]
        extra_terms = []
        token2id = self.tfidf.dictionary.token2id
        for ph in candidates:
            weight = 0.0
            if ph in token2id:
                weight = self.tfidf.inner_model.idfs[token2id[ph]]
            neighbours = []
            if weight > self.alpha:
                neighbours = self.w2v.inner_model.most_similar_cosmul(
                    positive=[ph], topn=self.k)
            extra_terms += [term for term, _score in neighbours]
        return query + " " + " ".join(extra_terms)

    def __str__(self):
        return self.__class__.__name__
class WeightedW2VExpansion(QueryExpansion):
    """idf-gated word2vec query expansion.

    Only phrases that are rare in the corpus (idf above ``alpha``) are
    expanded with their top-k word2vec (cosmul) neighbours.
    """

    def __init__(self, alpha=6.0):
        # phrase detector
        self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
        # number converter
        self.tokenizer = RawTokenizer()
        # build model: word2vec for neighbours, tf-idf for phrase weighting
        self.w2v = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
        self.tfidf = TFIDFmodel()
        # parameters
        self.alpha = alpha  # idf threshold: only rarer phrases are expanded
        self.k = 3          # neighbours per expanded phrase
        self.p = 0.80       # presumably a similarity threshold — unused here; TODO confirm

    def expand(self, query):
        """Return the query followed by neighbour terms of its rare phrases."""
        phrases = self.phrase_detector.detect(
            self.tokenizer.tokenize(query.lower()))
        w2v_phrases = [phrase for phrase in phrases if phrase in self.w2v]
        extra_terms = []
        for phrase in w2v_phrases:
            # idf of the phrase; phrases unknown to the dictionary stay 0.0
            idf = 0.0
            if phrase in self.tfidf.dictionary.token2id:
                idf = self.tfidf.inner_model.idfs[
                    self.tfidf.dictionary.token2id[phrase]]
            if idf > self.alpha:
                expansion = self.w2v.inner_model.most_similar_cosmul(
                    positive=[phrase], topn=self.k)
                extra_terms += [e[0] for e in expansion]
        # FIX: join query and terms in one pass so an empty expansion no
        # longer leaves a trailing space on the query string. (Also removed
        # a dead commented-out Python-2 `print` statement.)
        new_query = " ".join([query] + extra_terms)
        return new_query

    def __str__(self):
        return self.__class__.__name__
class TermwiseW2VExpansion(QueryExpansion):
    """Query expansion that swaps each known phrase for its cosmul neighbours."""

    def __init__(self, k=1):
        # phrase detector
        self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
        # number converter
        self.tokenizer = RawTokenizer()
        # build model
        self.model = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
        # parameters
        self.p = 0.7
        self.k = k

    def expand(self, query):
        """Build self.k translated queries and join them with '.'."""
        tokens = self.tokenizer.tokenize(query.lower())
        in_vocab = [ph for ph in self.phrase_detector.detect(tokens)
                    if ph in self.model]
        buckets = [[] for _ in range(self.k)]
        for ph in in_vocab:
            neighbours = self.model.inner_model.most_similar_cosmul(
                ph, topn=self.k)
            for rank in range(self.k):
                buckets[rank - 1].append(neighbours[rank - 1][0])
        return ".".join(" ".join(bucket) for bucket in buckets)

    def __str__(self):
        return self.__class__.__name__
class AverageW2VExpansion(QueryExpansion):
    """Expand a query with terms whose vectors lie near the mean of the
    query's phrase vectors.

    Neighbours scoring above the similarity threshold ``p`` are appended
    (with underscores turned back into spaces) to the original query.
    """

    def __init__(self, p=0.7):
        # phrase detector
        self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
        # number converter
        self.tokenizer = RawTokenizer()
        # build model
        self.model = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
        # parameters
        self.p = p   # minimum cosine similarity for an expansion term
        self.n = 10  # candidate neighbours to consider

    def expand(self, query):
        """Return query plus neighbour phrases scoring above self.p."""
        phrases = self.phrase_detector.detect(
            self.tokenizer.tokenize(query.lower()))
        w2v_phrases = [phrase for phrase in phrases if phrase in self.model]
        # FIX: most_similar raises ValueError on an empty positive list;
        # with no in-vocabulary phrase, return the query unchanged.
        if not w2v_phrases:
            return query
        similar_phrases = self.model.inner_model.most_similar(w2v_phrases, [],
                                                              topn=self.n)
        extra_terms = " ".join([
            phrase[0].replace('_', ' ')
            for phrase in similar_phrases if phrase[1] > self.p
        ])
        return "%s %s" % (query, extra_terms, )

    def __str__(self):
        return self.__class__.__name__
def __init__(self, size=50, modelfile=None): self.phrase_detector = PmiPhraseDetector( RawSentenceStream(fz_docs=False)) # build model epochs = 2 self.model = D2Vmodel( PhraseSentenceStream(self.phrase_detector, extract_func=extract_docid, fz_docs=True, reshuffles=epochs - 1), name="DOCID", dataset_name="CASEREPORT", epochs=epochs, dimension=size, modelfile=modelfile, ) self.doc_index = DocIndex(CaseReportLibrary(), "CASEREPORT")
class AverageW2VExpansion(QueryExpansion):
    """Expansion based on neighbours of the averaged query phrase vectors."""

    def __init__(self, p=0.7):
        # phrase detector
        self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
        # number converter
        self.tokenizer = RawTokenizer()
        # build model
        self.model = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
        # parameters
        self.p = p
        self.n = 10

    def expand(self, query):
        """Append phrases similar (above p) to the query's mean vector."""
        tokens = self.tokenizer.tokenize(query.lower())
        in_vocab = [ph for ph in self.phrase_detector.detect(tokens)
                    if ph in self.model]
        neighbours = self.model.inner_model.most_similar(in_vocab, [],
                                                         topn=self.n)
        strong = [term.replace('_', ' ')
                  for term, score in neighbours if score > self.p]
        extra_terms = " ".join(strong)
        return "%s %s" % (query, extra_terms, )

    def __str__(self):
        return self.__class__.__name__
class TermWindowW2VExpansion(QueryExpansion):
    """Query expansion driven by each phrase's local context window.

    For every in-vocabulary phrase, its window (previous/current/next
    phrase) is fed to word2vec ``most_similar`` and the top-k hits are
    distributed over k translated query variants, which are appended to
    the original query separated by dots.
    """

    def __init__(self, k=1):
        # phrase detector
        self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
        # number converter
        self.tokenizer = RawTokenizer()
        # build model
        self.model = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
        # parameters
        self.p = 0.8  # presumably a similarity threshold — unused here; TODO confirm
        self.k = k    # number of translated query variants

    def expand(self, query):
        """Return query + '.' + the k dot-separated translated variants."""
        phrases = self.phrase_detector.detect(
            self.tokenizer.tokenize(query.lower()))
        w2v_phrases = [phrase for phrase in phrases if phrase in self.model]
        translated_queries = [[] for _ in range(self.k)]
        for idx, phrase in enumerate(w2v_phrases):
            # adjacent phrases; empty sentinels are filtered out below
            prev_phrase = w2v_phrases[idx - 1] if idx != 0 else u""
            next_phrase = w2v_phrases[
                idx + 1] if idx != len(w2v_phrases) - 1 else u""
            window = [e for e in [prev_phrase, phrase, next_phrase]
                      if len(e) > 0]
            similar_phrases = self.model.inner_model.most_similar(
                window, topn=self.k)
            # FIX: direct indexing; the old [i - 1] wrap-around paired
            # identical indices anyway, only in a rotated visit order.
            for i in range(self.k):
                translated_queries[i].append(similar_phrases[i][0])
        query_strings = [" ".join(q) for q in translated_queries]
        combined_query = query + "." + ".".join(query_strings)
        return combined_query

    def __str__(self):
        return self.__class__.__name__
__author__ = 'matias'

from textanalysis.phrasedetection import PmiPhraseDetector
from textanalysis.texts import RawSentenceStream
import logging

# setup logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# inspection script: train/load the PMI phrase detector over the raw
# sentence stream and dump the detected phrases to stdout
phrase_detector = PmiPhraseDetector(RawSentenceStream())
phrase_detector.print_phrases()
from irmodels.D2Vmodel import D2Vmodel, DocIndex
from textanalysis.texts import PhraseSentenceStream, RawSentenceStream, extract_docid, extract_mesh_terms
from textanalysis.phrasedetection import PmiPhraseDetector
import logging
from scipy.spatial.distance import cosine
from textanalysis.texts import FZArticleLibrary, CaseReportLibrary
from heapq import heappush, heappop
import numpy as np

# setup logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# phrase detector at PMI level 90, persisted under a level-specific filename
pmi_level = 90
phrase_detector = PmiPhraseDetector(
    RawSentenceStream(fz_docs=False),
    filename=str("PHRASE_%s_2_CASEREPORT_RAW" % (pmi_level, )))

# build a doc2vec model keyed by document id over the case-report corpus;
# reshuffles = epochs - 1 so each extra epoch sees a re-shuffled order
epochs = 2
m = D2Vmodel(PhraseSentenceStream(phrase_detector,
                                  extract_func=extract_docid,
                                  fz_docs=False,
                                  reshuffles=epochs - 1),
             name="DOCID",
             dataset_name="CASEREPORT",
             epochs=epochs,
             dimension=40)

# document-id -> document lookup for the case-report corpus
doc_index = DocIndex(CaseReportLibrary(), "CASEREPORT")
# NOTE(review): the triple-quote below opens a block string that continues
# past this chunk; it is preserved as-is.
"""
__author__ = 'matias'

from irmodels.D2Vmodel import D2Vmodel
from textanalysis.texts import PhraseSentenceStream, RawSentenceStream, ExtractDiseases
from textanalysis.phrasedetection import PmiPhraseDetector
from scipy.spatial.distance import cosine
from heapq import heappush, heappop
import numpy as np
import logging

# setup logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# phrase detector and disease-name extractor used to tag training docs
phrase_detector = PmiPhraseDetector(RawSentenceStream(fz_docs=False))
extract_disease = ExtractDiseases()

# build a doc2vec model keyed by extracted disease over the case-report
# corpus; reshuffles = epochs - 1 re-shuffles the order between epochs
epochs = 3
m = D2Vmodel(PhraseSentenceStream(phrase_detector,
                                  extract_func=extract_disease,
                                  fz_docs=False,
                                  reshuffles=epochs - 1),
             name="DISEASE",
             dataset_name="CASEREPORT",
             epochs=epochs)

# sanity check: fetch one vector and report whether it is all-NaN
# NOTE(review): despite its name, vec_lupus holds the vector for "man"
vec_lupus = m.inner_model["man"]
print np.all(np.isnan(vec_lupus))
import logging
from irmodels.W2Vmodel import W2Vmodel
from textanalysis.texts import PhraseSentenceStream, RawSentenceStream
from textanalysis.phrasedetection import PmiPhraseDetector
import numpy as np
from random import sample
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# setup logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# phrase detector over the raw sentence stream
phrase_detector = PmiPhraseDetector(RawSentenceStream())

# build the word2vec model over phrase-detected sentences
m = W2Vmodel(PhraseSentenceStream(phrase_detector))

# load labelled evaluation terms from local text files
diseases = {}
symptoms = {}
with open("testdiseases.txt", 'r') as infile:
    # each line: "<numeric label>,<disease name>"; [:-1] drops the empty
    # fragment after the trailing newline
    for line in infile.read().split("\n")[:-1]:
        parts = line.split(",")
        diseases[parts[1]] = int(parts[0])
with open("testsymptoms.txt", 'r') as infile:
    # one symptom per line, stored as a membership map
    for line in infile.read().split("\n")[:-1]:
        symptoms[line] = 1

# disease data (keywords is consumed further down, past this chunk)
keywords = diseases.keys()