def __init__(self, k=1):
    """Build the phrase detector, tokenizer and word2vec model.

    :param k: ranking parameter kept on the instance (default 1).
    """
    # PMI-based phrase detection over the raw sentence stream.
    self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
    # Tokenizer for splitting incoming raw text.
    self.tokenizer = RawTokenizer()
    # word2vec model built from phrase-annotated sentences.
    self.model = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
    # Scoring parameters used elsewhere in this class.
    self.p = 0.8
    self.k = k
def __init__(self, alpha=6.0):
    """Build phrase detection, tokenization, word2vec and TF-IDF models.

    :param alpha: weighting parameter kept on the instance (default 6.0).
    """
    # PMI-based phrase detection over the raw sentence stream.
    self.phrase_detector = PmiPhraseDetector(RawSentenceStream())
    # Tokenizer for splitting incoming raw text.
    self.tokenizer = RawTokenizer()
    # word2vec model built from phrase-annotated sentences.
    self.w2v = W2Vmodel(PhraseSentenceStream(self.phrase_detector))
    # Companion TF-IDF model.
    self.tfidf = TFIDFmodel()
    # Scoring parameters used elsewhere in this class.
    self.alpha = alpha
    self.k = 3
    self.p = 0.80
def __init__(self):
    """Train an SVM domain classifier on labeled word embeddings.

    Reads ``data/misc/domain_classify.txt`` (one ``word,label`` pair per
    line), looks up each word's vector in the word2vec model, and fits an
    ``svm.SVC`` on the resulting (vector, label) pairs.

    Raises:
        KeyError: if a word from the file is not in the w2v vocabulary.
        ValueError: if a non-blank line does not contain exactly one comma.
    """
    self.w2v = W2Vmodel()
    xs = []
    ys = []
    with open('data/misc/domain_classify.txt', 'r') as infile:
        for line in infile.read().split("\n"):
            # Fix: a trailing newline at EOF produced an empty final
            # element, which crashed the two-value unpack below.
            line = line.strip()
            if not line:
                continue
            word, y = line.split(',')
            word_vec = self.w2v.inner_model[word]
            xs.append(word_vec)
            ys.append(y)
    self.clf = svm.SVC()
    self.clf.fit(xs, ys)
import logging

from irmodels.W2Vmodel import W2Vmodel
from textanalysis.texts import PhraseSentenceStream, RawSentenceStream
from textanalysis.phrasedetection import PmiPhraseDetector
import numpy as np
from random import sample
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# setup logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# PMI-based phrase detector over the raw sentence stream
phrase_detector = PmiPhraseDetector(RawSentenceStream())
# build word2vec model from phrase-annotated sentences
m = W2Vmodel(PhraseSentenceStream(phrase_detector))

diseases = {}
symptoms = {}
# testdiseases.txt: "<count>,<disease>" per line; the final split element is
# the empty string after the trailing newline, hence the [:-1]
with open("testdiseases.txt", 'r') as infile:
    for line in infile.read().split("\n")[:-1]:
        parts = line.split(",")
        diseases[parts[1]] = int(parts[0])
# testsymptoms.txt: one symptom name per line
with open("testsymptoms.txt", 'r') as infile:
    for line in infile.read().split("\n")[:-1]:
        symptoms[line] = 1

# disease data
# Fix: under Python 3, dict.keys() is a view, which cannot be indexed and is
# rejected by random.sample (imported above) — materialize it as a list.
keywords = list(diseases.keys())