def convertFiles2TextIter(in_dir, label):
    """Yield ``[filename, meta_string, cleansed_text]`` for each regular
    file in *in_dir*.

    Each file is parsed with CVParser, preprocessed, cleansed, hashed
    (MD5) and tagged with a meta string built from the hash and *label*.

    :param in_dir: directory whose files are converted
    :param label:  label passed to CVParser and getMetaString
    :raises SystemExit: if CVParser reports an error for any file
    """
    index = 0
    out_dir = ''  # CVParser requires an output dir argument; unused here
    for file1 in os.listdir(in_dir):
        filePath = os.path.join(in_dir, file1)
        if not os.path.isfile(filePath):
            continue  # skip sub-directories and other non-files
        index += 1
        cvparser = CVParser(filePath, label, out_dir)
        if cvparser.errorMsg:
            print(cvparser.errorMsg)
            # Exit with a non-zero status: this is an error path, not a
            # clean shutdown (original exited with 0).
            sys.exit(1)
        text = cleanse_data(cvparser.preprocess(index))
        md5_str = getMD5HashDigest(text)
        metaStr = getMetaString(md5_str, label)
        yield [file1, metaStr, text]
def __iter__(self):
    """Stream tokenized sentences: for every file in ``self.dirname``,
    yield one lower-cased, whitespace-split token list per line
    (gensim-style corpus iterator).

    Each line is passed through ``cleanse_data`` before tokenizing.
    """
    for fname in os.listdir(self.dirname):
        # 'with' guarantees the handle is closed; the original opened a
        # new file per iteration and never closed any of them.
        with open(os.path.join(self.dirname, fname)) as fh:
            for line in fh:
                yield cleanse_data(line).lower().split()
def __iter__(self):
    """Yield the cleansed UTF-8 text of every file in ``self.dirname``.

    Side effect: appends each filename to ``self.flist`` so callers can
    map yielded texts back to their source files.
    """
    for fname in os.listdir(self.dirname):
        self.flist.append(fname)
        # Close the handle deterministically (original leaked it).
        with open(os.path.join(self.dirname, fname)) as fh:
            raw = fh.read()
        # Python-2 style bytes -> unicode decode, silently dropping
        # undecodable bytes.
        text = str.decode(raw, "UTF-8", "ignore")
        yield cleanse_data(text)
def __iter__(self):
    """Yield the cleansed, lower-cased UTF-8 text of every file in
    ``self.dirname`` (one string per file)."""
    for fname in os.listdir(self.dirname):
        # Close the handle deterministically (original leaked it).
        with open(os.path.join(self.dirname, fname)) as fh:
            raw = fh.read()
        # Python-2 style bytes -> unicode decode, ignoring bad bytes.
        text = str.decode(raw, "UTF-8", "ignore")
        yield cleanse_data(text).lower()
def scan_file(dir_name):
    """Scan every file in *dir_name* and yield its cleansed text.

    :param dir_name: directory to scan (every entry is assumed to be a
        readable text file -- TODO confirm; no isfile() guard here)
    """
    for fname in os.listdir(dir_name):
        # 'with' closes the handle; the original never closed it.
        with open(os.path.join(dir_name, fname), "r") as fp:
            raw = fp.read()
        yield cleanse_data(raw)
def predict_model(self, dirname, w2v_model_path, topN, ndim):
    """Build document vectors for every file in *dirname*.

    Fits a TF-IDF model over the directory, then for each file extracts
    its top-N words and computes a doc vector via ``get_docvec``.

    :param dirname: directory of input documents
    :param w2v_model_path: path to the word2vec model
    :param topN: number of top TF-IDF words per document
    :param ndim: unused here -- kept for interface compatibility
    :returns: ``(X, fn)`` -- list of doc vectors and matching filenames
    """
    tfidf_model = self.get_tfidf_model(dirname)
    X = []
    fn = []
    for fname in os.listdir(dirname):
        # 'with' closes each handle; the original leaked one per file.
        with open(os.path.join(dirname, fname), "r") as f:
            raw_text = str.decode(f.read(), "UTF-8", "ignore")
        text = cleanse_data(raw_text)
        pword, topN = self.top_n_words_doc(
            w2v_model_path, text, tfidf_model, topN)
        X_coeff = self.get_docvec(
            w2v_model_path, tfidf_model, pword, text, topN)
        fn.append(fname)
        X.append(X_coeff[0])
    return X, fn
def train_model(self, dirname, w2v_model_path, ndim):
    """Compute a circular-convolution sentence vector for every file in
    *dirname*, TF-IDF weighted and augmented with SRL arguments.

    :param dirname: directory of input documents
    :param w2v_model_path: path to the word2vec model
    :param ndim: dimensionality passed to ``get_sent_circconv_vec``
    :returns: ``(wt_vect_data, fn)`` -- vectors and matching filenames
    """
    tfidf_model = self.get_tfidf_model(dirname)
    w2v_model = self.load_w2vmodel(w2v_model_path)
    trd = DocumentFeatures()
    wt_vect_data = []
    fn = []
    for fname in os.listdir(dirname):
        # 'with' closes each handle; the original leaked one per file.
        with open(os.path.join(dirname, fname)) as f:
            text = str.decode(f.read(), "UTF-8", "ignore")
        text = cleanse_data(text)
        print("processsing ::" + fname)
        # SRL argument spans (verb arguments 0 and 1) feed the vector.
        VA0, VA1 = srl_extract(text)
        sent_vect = trd.get_sent_circconv_vec(
            text, w2v_model, ndim, 'tfidf', tfidf_model, VA0, VA1)
        fn.append(fname)
        wt_vect_data.append(sent_vect[0])
    return wt_vect_data, fn
def train_model(self, dirname, w2v_model_path, topN, ndim):
    """Build labeled training vectors for every file in *dirname*.

    For each file: extract top-N TF-IDF words, compute a doc vector, and
    derive a binary label from the filename (files whose name contains
    'accept' just before a 4-char suffix are positives).

    :param dirname: directory of input documents
    :param w2v_model_path: path to the word2vec model
    :param topN: number of top TF-IDF words per document
    :param ndim: unused here -- kept for interface compatibility
    :returns: ``(X, label_data, fn)`` -- vectors, 0/1 labels, filenames
    """
    tfidf_model = self.get_tfidf_model(dirname)
    X = []
    label_data = []
    fn = []
    for fname in os.listdir(dirname):
        # 'with' closes each handle; the original leaked one per file.
        with open(os.path.join(dirname, fname), "r") as f:
            raw_text = str.decode(f.read(), "UTF-8", "ignore")
        text = cleanse_data(raw_text)
        pword, topN = self.top_n_words_doc(
            w2v_model_path, text, tfidf_model, topN)
        X_coeff = self.get_docvec(
            w2v_model_path, tfidf_model, pword, text, topN)
        # e.g. 'foo_accept.txt'[-10:-4] == 'accept' -> positive sample
        label = 1 if fname[-10:-4] == 'accept' else 0
        fn.append(fname)
        X.append(X_coeff[0])
        label_data.append(label)
    return X, label_data, fn