Example #1
# Requires TF 1.x (tf.contrib), numpy as np, and the project's
# utils / CategoryDataUtils / myprint helpers.
def main(infile, ignore_case):
    # Collect the cleaned "title + content" text of every article.
    text = []
    for item in utils.fileLineIter(infile):
        url, category, subcat, title, content = item[:5]
        cleaned_text = CategoryDataUtils.clean_str(title + " " + content)
        text.append(cleaned_text)

    # Build the vocabulary: documents are capped at 400 tokens and words
    # must appear at least 20 times to be kept.
    vocabproc = tf.contrib.learn.preprocessing.VocabularyProcessor(
        400, min_frequency=20)
    vocabproc.fit(text)  # only the fitted vocabulary is needed here

    # Scan the Google News word2vec binary and record which of our
    # vocabulary words it covers.
    inner = {}
    with open("Dataset/GoogleNews-vectors-negative300.bin", "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        print("  [*]Google:vocab_size:%s" % (vocab_size))
        binary_len = np.dtype('float32').itemsize * layer1_size
        for line in range(vocab_size):
            word = []
            while True:
                ch = f.read(1).decode('latin-1')
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            if ignore_case:
                word = word.lower()
            idx = vocabproc.vocabulary_.get(word)
            f.read(binary_len)
            if idx != 0:
                inner[word] = True
            #myprint("%s " %(word))

    myprint("Inner join/Total Vocabulary : %s/%s\n" %
            (len(inner), len(vocabproc.vocabulary_)))
    myprint("Word Not In Google Word2Vec:")
    for word in vocabproc.vocabulary_._mapping:
        if word not in inner:
            myprint(word)

    myprint("Word In Google Word2Vec:")
    for word in vocabproc.vocabulary_._mapping:
        if word in inner:
            myprint(word)
    return
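The loop above parses the word2vec binary format by hand (a space-terminated word followed by layer1_size float32 values per record). The same coverage check can be written with gensim's KeyedVectors instead; a minimal sketch, assuming gensim >= 4.0 is available and reusing the fitted vocabproc from above:

from gensim.models import KeyedVectors

def vocab_coverage(vocabproc, w2v_path="Dataset/GoogleNews-vectors-negative300.bin"):
    # binary=True matches the GoogleNews .bin distribution.
    kv = KeyedVectors.load_word2vec_format(w2v_path, binary=True)
    vocab = list(vocabproc.vocabulary_._mapping)
    covered = [w for w in vocab if w in kv]
    missing = [w for w in vocab if w not in kv]
    return covered, missing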
Example #2
def _predict(self, text_list):
    # Accept a single string or a list of strings.
    if isinstance(text_list, str):
        text_list = [text_list]
    text_list = CategoryDataUtils.textListPreprocess(text_list)  # clean strings
    vec = np.array(list(self.vocabproc.transform(text_list)))
    feed_dict = {
        self.rcnn.input_text: vec,
        self.rcnn.dropout_keep_prob: 1.0,  # no dropout at inference time
    }
    _, prob = self.sess.run([self.rcnn.predictions, self.rcnn.prob], feed_dict)
    # One {category: probability} dict per input article.
    result = []
    for item in prob:
        cur_article = {self.idx2cat[i]: item[i] for i in range(len(item))}
        result.append(cur_article)
    return result
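For context, a hypothetical call site: _predict accepts a single string or a list of strings and returns one {category: probability} dict per input (predictor stands for an assumed instance of the surrounding class):

# Hypothetical usage of _predict.
scores = predictor._predict("Stocks rallied after the central bank's announcement")
top3 = sorted(scores[0].items(), key=lambda kv: -kv[1])[:3]
for category, prob in top3:
    print("%s: %.3f" % (category, prob))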
Example #3
def process(in_file, out_file, max_content_length, max_size_per_category):
    print("[*]max content length:%s" % (max_content_length))
    print("[*]%s -> %s" % (in_file, out_file))
    data_list = CategoryDataUtils.GetDataList(in_file)  # avoid shadowing builtin `list`
    category_cnt = {}
    with open(out_file, "wb") as fout:
        for category, url, title, content, subcategory in data_list:
            # Cap the number of articles kept per category.
            if category not in category_cnt:
                category_cnt[category] = 0
            if category_cnt[category] >= max_size_per_category:
                continue
            category_cnt[category] += 1
            # Truncate overly long article bodies.
            content = getTruncatedContent(content, max_content_length)
            line_out = "%s\t%s\t%s\t%s\t%s\r\n" % (url, category, subcategory, title, content)
            fout.write(line_out.encode("utf-8"))
    for cat in category_cnt:
        print("[*] %s :%d" % (cat, category_cnt[cat]))
    return
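getTruncatedContent is referenced but not defined in this example; a minimal sketch, under the assumption that it truncates to a maximum number of whitespace-separated words:

def getTruncatedContent(content, max_content_length):
    # Assumed behavior: keep at most max_content_length tokens.
    return " ".join(content.split()[:max_content_length])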
Example #4
File: MyData.py Project: vcvycy/Category
def __init__(
        self,
        datafile,
        minSizePerCategory,
        max_article_length=400,
        min_frequency=20,
        training_share=0.9,
        dropout=1.0  # dropout is not implemented yet
):
    self.text_data = {}  # category -> list of article texts
    self.total_size = 0
    self.initTextData(datafile, minSizePerCategory)
    self.data, self.vocabSize, self.vocabproc = CategoryDataUtils.category_dict2vec_vocabular_processor(
        self.text_data,
        max_article_length=max_article_length,
        min_freq=min_frequency,
    )
    self.dropout = dropout
    # Split self.data into training/testing data.
    self.test_data = {}  # {"world": ["article", "article", ...], ...}
    self.training_share = training_share
    self.splitIntoTrainingAndTestData()
    # Map each category to an index, in sorted order.
    self.cat2idx = {}
    cats = sorted(self.text_data)
    for i in range(len(cats)):
        self.cat2idx[cats[i]] = i
    # Precompute one-hot label vectors, one per class.
    self.oneHot = []
    for i in range(self.getClasses()):
        y = [0 for _ in range(self.getClasses())]
        y[i] = 1
        self.oneHot.append(y)

    self.showDetail()
    return
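Together, cat2idx and oneHot give the loader a category-to-label mapping. A small illustrative sketch (data stands for an assumed MyData instance; "world" is one of the category names from the comment above):

# Hypothetical usage of the mappings built in __init__.
idx = data.cat2idx["world"]   # position of "world" in the sorted category list
label = data.oneHot[idx]      # one-hot vector of length data.getClasses()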
Example #5
File: MyData.py Project: vcvycy/Category
def initTextData(self, datafile, minSizePerCategory):
    print("[*]init Text Data")
    self.text_data = {}
    data_list = CategoryDataUtils.GetDataList(datafile)  # avoid shadowing builtin `list`
    for category, url, title, content, _ in data_list:
        if category not in self.text_data:
            self.text_data[category] = []
        self.text_data[category].append(
            ArticleFilter.regular(url, title, content))
    # Drop categories that have too few samples.
    tmp = {}
    for cat in self.text_data:
        if len(self.text_data[cat]) >= minSizePerCategory:
            tmp[cat] = self.text_data[cat]
            self.total_size += len(tmp[cat])
        else:
            print("[!] Category %s removed; sample size %d is below the minimum" %
                  (cat, len(self.text_data[cat])))
    self.text_data = tmp
    return
Example #6
def predict_split(self, text_list):
    # Accept a single string or a list of strings.
    if isinstance(text_list, str):
        text_list = [text_list]
    text_list = CategoryDataUtils.textListPreprocess(text_list)  # clean strings
    _vec = list(self.vocabproc.transform(text_list))
    # Make partNum copies of each article, each with one segment of the
    # 400-token window zeroed out; the partial predictions are merged below.
    partNum = 1
    partSize = int(400 / partNum)
    vec = []
    for item in _vec:
        for i in range(partNum):
            tmp = item.copy()
            for j in range(400):
                if partSize * i <= j < partSize * (i + 1):  # mask the i-th segment
                    tmp[j] = 0
            vec.append(tmp)
    vec = np.array(vec)
    print("[*]inputs after split:%d" % len(vec))
    result = []
    print("[*]transform success")
    # Run the network in batches of 500 inputs.
    for i in range(0, len(vec), 500):
        feed_dict = {
            self.rcnn.input_text: vec[i:i + 500],
            self.rcnn.dropout_keep_prob: 1.0,
        }
        _, prob = self.sess.run([self.rcnn.predictions, self.rcnn.prob], feed_dict)
        for item in prob:
            cur_article = {self.idx2cat[j]: item[j] for j in range(len(item))}
            result.append(cur_article)
        print("[*]progress:%d/%d" % (min(i + 500, len(vec)), len(vec)))
    # Merge the partNum partial predictions of each article back into one.
    _result = []
    for i in range(0, len(result), partNum):
        _result.append(merge_result_mean(result[i:i + partNum]))
    return _result
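merge_result_mean is used above but not shown; a plausible minimal sketch, assuming it averages the per-category probabilities of the partNum partial predictions belonging to one article:

def merge_result_mean(part_results):
    # Average the probability each category received across the partial predictions.
    merged = {}
    for res in part_results:
        for cat, prob in res.items():
            merged[cat] = merged.get(cat, 0.0) + prob / len(part_results)
    return merged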
Example #7
def predict(self, text_list, preserve_words=400):
    # Accept a single string or a list of strings.
    if isinstance(text_list, str):
        text_list = [text_list]
    text_list = [CategoryDataUtils.clean_str(sent) for sent in text_list]
    vec = np.array(list(self.vocabproc.transform(text_list)))

    # Zero out every token position past the first preserve_words,
    # so the model only sees the head of each article.
    for i in range(len(vec)):
        for j in range(preserve_words, 400):
            vec[i][j] = 0
    result = []
    print("[*]transform success")
    # Run the network in batches of 500 inputs.
    for i in range(0, len(vec), 500):
        feed_dict = {
            self.rcnn.input_text: vec[i:i + 500],
            self.rcnn.dropout_keep_prob: 1.0,
        }
        _, prob = self.sess.run([self.rcnn.predictions, self.rcnn.prob], feed_dict)
        for item in prob:
            cur_article = {self.idx2cat[j]: item[j] for j in range(len(item))}
            result.append(cur_article)
        print("[*]progress:%d/%d" % (min(i + 500, len(vec)), len(vec)))
    return result
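A hypothetical call: with preserve_words=100, every article is scored on only its first 100 token positions of the fixed 400-token window (predictor stands for an assumed instance of the surrounding class):

# Hypothetical usage of predict with a shortened window.
scores = predictor.predict(["long article text ..."], preserve_words=100)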
Example #8
def considerHost(url, title, content):
    # Prepend the URL's host name so the model can use the source site as a feature.
    x = "%s %s %s" % (Utils.getHostFromUrl(url), title, content)
    return CategoryDataUtils.clean_str(x)
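Utils.getHostFromUrl is project code that is not shown here; an equivalent minimal sketch using only the standard library would be:

from urllib.parse import urlparse

def get_host_from_url(url):
    # "https://news.example.com/a/b" -> "news.example.com"
    return urlparse(url).netloc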