def __init__(self, k=1):
    """Assemble the word2vec pipeline: phrase detection, tokenization, model.

    :param k: stored verbatim on the instance; its semantics are defined by
        the methods that consume ``self.k`` — TODO confirm with callers.
    """
    # Phrase detector fitted on the raw sentence stream.
    self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
    # number converter
    self.tokenizer = RawTokenizer()
    # Word2vec model trained over phrase-annotated sentences.
    self.model = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
    # Tunable parameters (p is fixed; k comes from the caller).
    self.p = 0.8
    self.k = k
def __init__(self, alpha=6.0):
    """Assemble the combined word2vec + tf-idf pipeline.

    :param alpha: stored verbatim on the instance; consumed by the methods
        that read ``self.alpha`` — TODO confirm semantics with callers.
    """
    # Phrase detector fitted on the raw sentence stream.
    self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
    # number converter
    self.tokenizer = RawTokenizer()
    # Word2vec model trained over phrase-annotated sentences,
    # paired with a tf-idf model.
    self.w2v = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
    self.tfidf = TFIDFmodel()
    # Tunable parameters (k and p are fixed; alpha comes from the caller).
    self.alpha = alpha
    self.k = 3
    self.p = 0.80
def __init__(self, size=50, modelfile=None):
    """Build the doc2vec model over the case-report corpus.

    :param size: embedding dimension passed to ``D2Vmodel``.
    :param modelfile: optional path of a pre-trained model file,
        forwarded unchanged to ``D2Vmodel``.
    """
    # Phrase detector fitted on the raw sentence stream (no fz docs).
    self.phrase_detector = PmiPhraseDetector(
        RawSentenceStream(fz_docs=False))
    # Two training epochs; the stream is reshuffled once between them.
    n_epochs = 2
    self.model = D2Vmodel(
        PhraseSentenceStream(self.phrase_detector,
                             extract_func=extract_docid,
                             fz_docs=True,
                             reshuffles=n_epochs - 1),
        name="DOCID",
        dataset_name="CASEREPORT",
        epochs=n_epochs,
        dimension=size,
        modelfile=modelfile,
    )
    # Index over the case-report library for document lookup.
    self.doc_index = DocIndex(CaseReportLibrary(), "CASEREPORT")
import logging
from irmodels.W2Vmodel import W2Vmodel
from textanalysis.texts import PhraseSentenceStream, RawSentenceStream
from textanalysis.phrasedetection import PmiPhraseDetector
import numpy as np
from random import sample
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# setup logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Fit the phrase detector on the raw sentence stream, then train the
# word2vec model over phrase-annotated sentences.
phrase_detector = PmiPhraseDetector(RawSentenceStream())
m = W2Vmodel(PhraseSentenceStream(phrase_detector))

# Load labelled diseases ("<label>,<name>" per line) and the symptom list.
# The trailing empty element after the final newline is dropped via [:-1].
diseases = {}
symptoms = {}
with open("testdiseases.txt", 'r') as infile:
    for record in infile.read().split("\n")[:-1]:
        fields = record.split(",")
        diseases[fields[1]] = int(fields[0])
with open("testsymptoms.txt", 'r') as infile:
    for record in infile.read().split("\n")[:-1]:
        symptoms[record] = 1

# disease data
keywords = diseases.keys()
from heapq import heappush, heappop
import numpy as np
import logging
# FIX: these names were used below but never imported (NameError at runtime);
# module paths mirror the sibling word2vec script in this project.
from irmodels.D2Vmodel import D2Vmodel
from textanalysis.texts import PhraseSentenceStream, RawSentenceStream
from textanalysis.phrasedetection import PmiPhraseDetector
# TODO(review): ExtractDiseases is also undefined here; its home module is
# not visible from this file — import it from wherever it is declared.

# setup logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# phrase detector
phrase_detector = PmiPhraseDetector(RawSentenceStream(fz_docs=False))
extract_disease = ExtractDiseases()

# Build the doc2vec model over disease-tagged case reports; the stream is
# reshuffled between each of the 3 epochs.
epochs = 3
m = D2Vmodel(PhraseSentenceStream(phrase_detector,
                                  extract_func=extract_disease,
                                  fz_docs=False,
                                  reshuffles=epochs - 1),
             name="DISEASE",
             dataset_name="CASEREPORT",
             epochs=epochs)

# Sanity-check the trained vectors: is the vector for "man" all-NaN?
# (Renamed from the misleading `vec_lupus` — it holds the "man" vector.)
vec_man = m.inner_model["man"]
# FIX: Python-2-only `print x` statements converted to print() calls;
# identical output for a single argument under both Python 2 and 3.
print(np.all(np.isnan(vec_man)))

# Count the DISEASE-* pseudo-words and list every vocab entry whose vector
# is not entirely NaN.
disease_count = len(
    [word for word in m.inner_model.vocab if word.startswith("DISEASE-")])
not_nans = [
    word for word in m.inner_model.vocab
    if not np.all(np.isnan(m.inner_model[word]))
]
print(not_nans)