Esempio n. 1
0
def convertFiles2TextIter(in_dir, label):
    """Yield [filename, meta-string, cleansed-text] for each regular file in *in_dir*.

    Parameters:
        in_dir: directory whose regular files are parsed with CVParser.
        label:  label forwarded to CVParser and getMetaString.

    Yields:
        list -- [original filename, metadata string, cleansed text].

    Exits the process with status 1 if CVParser reports an error message.
    """
    index = 0
    out_dir = ''  # CVParser requires an output-dir argument; none is written here
    for file1 in os.listdir(in_dir):
        filePath = os.path.join(in_dir, file1)

        # Guard clause: skip subdirectories and other non-file entries.
        if not os.path.isfile(filePath):
            continue

        index += 1
        cvparser = CVParser(filePath, label, out_dir)
        if cvparser.errorMsg:
            # BUG FIX: original called sys.exit(0), reporting *success*
            # to the shell on a fatal parse error.
            print(cvparser.errorMsg)
            sys.exit(1)

        text = cleanse_data(cvparser.preprocess(index))
        md5_str = getMD5HashDigest(text)
        metaStr = getMetaString(md5_str, label)

        yield [file1, metaStr, text]
Esempio n. 2
0
	def __iter__(self):
		"""Yield one token list (lower-cased, whitespace-split) per line of every
		file found directly in self.dirname.
		"""
		for fname in os.listdir(self.dirname):
			# BUG FIX: the original opened each file without ever closing it,
			# leaking one handle per file until garbage collection.
			with open(os.path.join(self.dirname, fname)) as fh:
				for line in fh:
					line = cleanse_data(line)
					yield line.lower().split()
Esempio n. 3
0
	def __iter__(self):
		"""Yield the cleansed UTF-8 text of every file in self.dirname.

		Side effect: appends each filename to self.flist. NOTE(review):
		flist grows again on every pass, so iterating the corpus twice
		duplicates its entries -- confirm callers iterate only once.
		"""
		for fname in os.listdir(self.dirname):
			self.flist.append(fname)
			# BUG FIX: context manager closes the handle the original leaked.
			with open(os.path.join(self.dirname, fname)) as fh:
				text = str.decode(fh.read(), "UTF-8", "ignore")
			yield cleanse_data(text)
Esempio n. 4
0
	def __iter__(self):
		"""Yield the cleansed, lower-cased UTF-8 text of every file in self.dirname."""
		for fname in os.listdir(self.dirname):
			# BUG FIX: context manager closes each handle the original leaked;
			# dead commented-out bookkeeping (count/flist) removed.
			with open(os.path.join(self.dirname, fname)) as fh:
				text = str.decode(fh.read(), "UTF-8", "ignore")
			yield cleanse_data(text).lower()
Esempio n. 5
0
def scan_file(dir_name):
	"""Scan every file in *dir_name* and yield its cleansed text.

	Parameters:
		dir_name: directory to scan; every entry is opened as a text file.

	Yields:
		str -- the file contents passed through cleanse_data.
	"""
	for fname in os.listdir(dir_name):
		# BUG FIX: original left the file handle open; 'with' closes it
		# as soon as the contents are read.
		with open(os.path.join(dir_name, fname), "r") as fp:
			text = fp.read()
		yield cleanse_data(text)
Esempio n. 6
0
	def predict_model(self, dirname, w2v_model_path, topN, ndim):
		"""Build a document vector for every file in *dirname*.

		Parameters:
			dirname:        directory of documents to vectorize.
			w2v_model_path: path to the word2vec model used for doc vectors.
			topN:           number of top TF-IDF words to use (note: the
			                helper returns an updated value that is fed back
			                into the next iteration, as in the original).
			ndim:           unused here; kept for interface compatibility.

		Returns:
			(X, fn): list of document vectors and the matching filenames.
		"""
		tfidf_model = self.get_tfidf_model(dirname)
		X = []
		fn = []
		for fname in os.listdir(dirname):
			# BUG FIX: context manager closes the per-file handle the
			# original leaked; dead commented-out print removed.
			with open(os.path.join(dirname, fname), "r") as f:
				raw_text = str.decode(f.read(), "UTF-8", "ignore")
			text = cleanse_data(raw_text)
			pword, topN = self.top_n_words_doc(w2v_model_path, text, tfidf_model, topN)
			X_coeff = self.get_docvec(w2v_model_path, tfidf_model, pword, text, topN)

			fn.append(fname)
			X.append(X_coeff[0])

		return X, fn
Esempio n. 7
0
	def train_model(self, dirname, w2v_model_path, ndim):
		"""Compute a circular-convolution sentence vector for every file in *dirname*.

		Parameters:
			dirname:        directory of training documents.
			w2v_model_path: path of the word2vec model to load.
			ndim:           dimensionality passed to get_sent_circconv_vec.

		Returns:
			(wt_vect_data, fn): per-document weighted vectors and filenames.
		"""
		tfidf_model = self.get_tfidf_model(dirname)
		w2v_model = self.load_w2vmodel(w2v_model_path)
		trd = DocumentFeatures()
		wt_vect_data = []
		fn = []  # unused local 'label_data' removed -- it was never returned
		for fname in os.listdir(dirname):
			# BUG FIX: 'with' closes each file handle the original leaked.
			with open(os.path.join(dirname, fname)) as f:
				text = str.decode(f.read(), "UTF-8", "ignore")
			text = cleanse_data(text)
			print("processsing ::" + fname)
			# Semantic-role-labelling extraction feeds the vectorizer below.
			VA0, VA1 = srl_extract(text)
			sent_vect = trd.get_sent_circconv_vec(text, w2v_model, ndim, 'tfidf', tfidf_model, VA0, VA1)

			fn.append(fname)
			wt_vect_data.append(sent_vect[0])

		return wt_vect_data, fn
Esempio n. 8
0
	def train_model(self, dirname, w2v_model_path, topN, ndim):
		"""Build document vectors and binary labels for every file in *dirname*.

		Parameters:
			dirname:        directory of training documents.
			w2v_model_path: path to the word2vec model.
			topN:           number of top TF-IDF words (helper feeds an
			                updated value back each iteration, as original).
			ndim:           unused here; kept for interface compatibility.

		Returns:
			(X, label_data, fn): vectors, 0/1 labels, and filenames.
		"""
		tfidf_model = self.get_tfidf_model(dirname)
		X = []
		label_data = []
		fn = []
		for fname in os.listdir(dirname):
			# BUG FIX: 'with' closes each per-file handle the original leaked.
			with open(os.path.join(dirname, fname), "r") as f:
				raw_text = str.decode(f.read(), "UTF-8", "ignore")
			text = cleanse_data(raw_text)
			pword, topN = self.top_n_words_doc(w2v_model_path, text, tfidf_model, topN)
			X_coeff = self.get_docvec(w2v_model_path, tfidf_model, pword, text, topN)

			# Positive label when the 6 chars before the 4-char extension
			# spell 'accept' (e.g. 'cv_accept.txt').
			label = 1 if fname[-10:-4] == 'accept' else 0

			fn.append(fname)
			X.append(X_coeff[0])
			label_data.append(label)
		return X, label_data, fn