def main(infile, ignore_case):
    # Collect the cleaned text (title + body) of every article in the corpus.
    text = []
    for item in utils.fileLineIter(infile):
        url = item[0]
        category = item[1]
        subcat = item[2]
        title = item[3]
        content = item[4]
        cleaned_text = CategoryDataUtils.clean_str(title + " " + content)
        text.append(cleaned_text)

    # Build the corpus vocabulary: at most 400 tokens per article,
    # words kept only if they occur at least 20 times.
    vocabproc = tf.contrib.learn.preprocessing.VocabularyProcessor(
        400, min_frequency=20)
    vocabproc.fit_transform(text)
    vp_size = len(vocabproc.vocabulary_)

    inner = {}
    # Walk the binary Google News word2vec model and record which corpus
    # words it covers.
    with open("Dataset/GoogleNews-vectors-negative300.bin", "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        print(" [*]Google:vocab_size:%s" % (vocab_size))
        binary_len = np.dtype('float32').itemsize * layer1_size
        for line in range(vocab_size):
            # Read one word character by character, up to the separating space.
            word = []
            while True:
                ch = f.read(1).decode('latin-1')
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            if ignore_case:
                word = word.lower()
            idx = vocabproc.vocabulary_.get(word)
            # Skip the embedding vector itself; only the word is needed here.
            f.read(binary_len)
            if idx != 0:
                inner[word] = True

    myprint("Inner join/Total Vocabulary : %s/%s\n" %
            (len(inner), len(vocabproc.vocabulary_)))
    myprint("Word Not In Google Word2Vec:")
    for word in vocabproc.vocabulary_._mapping:
        if word not in inner:
            myprint(word)
    myprint("Word In Google Word2Vec:")
    for word in vocabproc.vocabulary_._mapping:
        if word in inner:
            myprint(word)
    return
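# Usage sketch (not from the original repo): run this module as a script to
# report how much of a corpus vocabulary the pretrained Google word2vec model
# covers. "Dataset/news.txt" is a hypothetical input path; any file readable by
# utils.fileLineIter works.
if __name__ == "__main__":
    main("Dataset/news.txt", ignore_case=True)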
def _predict(self, text_list):
    # Accept a single string as well as a list of strings.
    if isinstance(text_list, str):
        text_list = [text_list]
    text_list = CategoryDataUtils.textListPreprocess(text_list)  # clean str
    vec = np.array(list(self.vocabproc.transform(text_list)))
    feed_dict = {
        self.rcnn.input_text: vec,
        self.rcnn.dropout_keep_prob: 1.0
    }
    predictions, prob = self.sess.run(
        [self.rcnn.predictions, self.rcnn.prob], feed_dict)
    # Map each probability vector back to its category names.
    result = []
    for item in prob:
        cur_article = {}
        for i in range(len(item)):
            cur_article[self.idx2cat[i]] = item[i]
        result.append(cur_article)
    return result
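# Usage sketch (assumption): _predict() is a method of a classifier wrapper that
# already holds self.sess, self.rcnn, self.vocabproc and self.idx2cat. The class
# name "Classifier" and the checkpoint path are hypothetical stand-ins.
def example_single_prediction():
    clf = Classifier("runs/checkpoint")  # hypothetical constructor
    scores = clf._predict("Stocks rallied after the central bank held rates steady.")
    # scores[0] maps each category name to its probability for the single input.
    best = max(scores[0], key=scores[0].get)
    print(best, scores[0][best])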
def process(in_file, out_file, max_content_length, max_size_per_category):
    print("[*]max content length:%s" % (max_content_length))
    print("[*]%s -> %s" % (in_file, out_file))
    data_list = CategoryDataUtils.GetDataList(in_file)
    fout = open(out_file, "wb")
    category_cnt = {}
    for category, url, title, content, subcategory in data_list:
        # Cap the number of articles kept per category.
        if category not in category_cnt:
            category_cnt[category] = 0
        if category_cnt[category] >= max_size_per_category:
            continue
        category_cnt[category] += 1
        # Truncate the article body to the configured length.
        content = getTruncatedContent(content, max_content_length)
        line_out = "%s\t%s\t%s\t%s\t%s\r\n" % (url, category, subcategory, title, content)
        fout.write(line_out.encode("utf-8"))
    fout.close()
    for cat in category_cnt:
        print("[*] %s :%d" % (cat, category_cnt[cat]))
    return
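# Usage sketch (assumption): build a balanced, size-capped training file from a
# raw crawl dump. Both file names are hypothetical; 400 matches the
# max_article_length used elsewhere in the repo, and 5000 caps each category.
def example_process():
    process("Dataset/raw_articles.txt", "Dataset/train_articles.txt",
            max_content_length=400, max_size_per_category=5000)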
def __init__(
        self,
        datafile,
        minSizePerCategory,
        max_article_length=400,
        min_frequence=20,
        training_share=0.9,
        droupout=1.0  # not implemented yet
):
    self.text_data = {}  # category -> article word list
    self.total_size = 0
    self.initTextData(datafile, minSizePerCategory)
    self.data, self.vocabSize, self.vocabproc = CategoryDataUtils.category_dict2vec_vocabular_processor(
        self.text_data,
        max_article_length=max_article_length,
        min_freq=min_frequence,
    )
    self.droupout = droupout

    # Split self.data into training/testing data.
    self.test_data = {}  # {"world": ["article", "article", ...], ...}
    self.training_share = training_share
    self.splitIntoTrainingAndTestData()

    # Category -> index mapping (category names sorted alphabetically).
    self.cat2idx = {}
    cats = sorted(self.text_data)
    for i in range(len(cats)):
        self.cat2idx[cats[i]] = i

    # One-hot label vectors, one per class.
    self.oneHot = []
    for i in range(self.getClasses()):
        y = [0 for _ in range(self.getClasses())]
        y[i] = 1
        self.oneHot.append(y)

    self.showDetail()
    return
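# Usage sketch (assumption): the constructor above belongs to a data-helper
# class; "CategoryData" is a hypothetical name for it, and the file path is a
# stand-in. One call loads the tab-separated articles produced by process(),
# drops under-sized categories, and builds the vocabulary, the train/test split
# and the one-hot labels.
def example_build_dataset():
    data = CategoryData("Dataset/train_articles.txt",
                        minSizePerCategory=1000,
                        max_article_length=400,
                        min_frequence=20,
                        training_share=0.9)
    print("classes:", data.getClasses(), "vocab size:", data.vocabSize)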
def initTextData(self, datafile, minSizePerCategory):
    print("[*]init Text Data")
    t_start = time.time()
    self.text_data = {}
    data_list = CategoryDataUtils.GetDataList(datafile)
    for category, url, title, content, _ in data_list:
        if category not in self.text_data:
            self.text_data[category] = []
        # self.text_data[category].append(ArticleFilter.considerHost(url, title, content))
        self.text_data[category].append(
            ArticleFilter.regular(url, title, content))

    # Keep only categories with at least minSizePerCategory samples.
    tmp = {}
    for cat in self.text_data:
        if len(self.text_data[cat]) >= minSizePerCategory:
            tmp[cat] = self.text_data[cat]
            self.total_size += len(tmp[cat])
        else:
            print("[!] Category %s ; sample size: %d removed" %
                  (cat, len(self.text_data[cat])))
    self.text_data = tmp
    return
def predict_split(self, text_list):
    # Accept a single string as well as a list of strings.
    if isinstance(text_list, str):
        text_list = [text_list]
    text_list = CategoryDataUtils.textListPreprocess(text_list)  # clean str
    _vec = list(self.vocabproc.transform(text_list))

    # Make partNum masked copies of each 400-token id vector,
    # zeroing out segment i in copy i.
    partNum = 1
    partSize = int(400 / partNum)
    vec = []
    for item in _vec:
        for i in range(partNum):
            tmp = item.copy()
            for j in range(400):
                if partSize * i < j < partSize * (i + 1):
                    tmp[j] = 0
            vec.append(tmp)
    vec = np.array(vec)
    print(len(vec))

    result = []
    print("[*]transform success")
    # Run the model in batches of 500.
    for i in range(0, len(vec), 500):
        feed_dict = {
            self.rcnn.input_text: vec[i:i + 500],
            self.rcnn.dropout_keep_prob: 1.0
        }
        predictions, prob = self.sess.run(
            [self.rcnn.predictions, self.rcnn.prob], feed_dict)
        for item in prob:
            cur_article = {}
            for j in range(len(item)):
                cur_article[self.idx2cat[j]] = item[j]
            result.append(cur_article)
        print("[*]progress:%d/%d" % (i, len(vec)))

    # Average the partNum per-segment predictions back into one result per article.
    _result = []
    for i in range(0, len(result), partNum):
        _result.append(merge_result_mean(result[i:i + partNum]))
    return _result
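# Usage sketch (assumption): predict_split() scores partNum masked copies of
# each article and averages them with merge_result_mean(), so the caller still
# gets one {category: probability} dict per input. "Classifier" is the same
# hypothetical wrapper name as in the earlier sketch.
def example_split_prediction():
    clf = Classifier("runs/checkpoint")  # hypothetical constructor
    scores = clf.predict_split("The senate passed the budget bill late on Friday.")
    print(sorted(scores[0].items(), key=lambda kv: -kv[1])[:3])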
def predict(self, text_list, preserve_words=400):
    # Accept a single string as well as a list of strings.
    if isinstance(text_list, str):
        text_list = [text_list]
    text_list = [CategoryDataUtils.clean_str(sent) for sent in text_list]
    vec = np.array(list(self.vocabproc.transform(text_list)))
    # Zero out every token after the first preserve_words positions.
    for i in range(len(vec)):
        for j in range(preserve_words, 400):
            vec[i][j] = 0

    result = []
    print("[*]transform success")
    # Run the model in batches of 500.
    for i in range(0, len(vec), 500):
        feed_dict = {
            self.rcnn.input_text: vec[i:i + 500],
            self.rcnn.dropout_keep_prob: 1.0
        }
        predictions, prob = self.sess.run(
            [self.rcnn.predictions, self.rcnn.prob], feed_dict)
        for item in prob:
            cur_article = {}
            for j in range(len(item)):
                cur_article[self.idx2cat[j]] = item[j]
            result.append(cur_article)
        print("[*]progress:%d/%d" % (i, len(vec)))
    return result
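# Usage sketch (assumption): predict() runs in batches of 500 and zeroes out all
# tokens after preserve_words, which is useful for checking how much of an
# article the model actually needs. "Classifier" is the hypothetical wrapper
# used in the earlier sketches.
def example_truncated_prediction():
    clf = Classifier("runs/checkpoint")  # hypothetical constructor
    scores = clf.predict(["A late goal decided the cup final."], preserve_words=100)
    print(scores[0])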
def considerHost(url, title, content):
    x = "%s %s %s" % (Utils.getHostFromUrl(url), title, content)
    return CategoryDataUtils.clean_str(x)