def process_count_topic(topic,name):
    """Count, for every word, how many problems of ``topic`` contain it.

    ``topic`` is an iterable of sections.  Element 0 of a section is only
    printed (presumably a title -- confirm against the caller); every later
    element is a numeric problem id whose absolute value is used.  Ids below
    800 are skipped without being fetched.

    For each remaining id the text returned by
    ``TextPre.convert_and_filter(url_conv(id))`` is split on whitespace.  A
    word is counted at most once per problem, and is ignored when it is
    shorter than three characters or listed in the module-level
    ``remove_word``.

    Side effects: prints progress lines, and adds one to the module-level
    ``amount_words`` for every problem that is fetched (once per problem,
    not once per word).

    Args:
        topic: iterable of indexable sections as described above.
        name: unused; kept so existing callers keep working.

    Returns:
        A ``Counter`` mapping word -> number of problems containing it.
    """
    global amount_words

    counter = Counter()
    for section in topic:
        print (section[0])
        for entry in section[1:]:
            num = abs(entry)
            print ("Problem ",num)
            if num < 800:
                continue

            url = url_conv(num)
            content = TextPre.convert_and_filter(url)
            amount_words += 1

            # Words already counted for this problem.  A set keeps the
            # membership test O(1) instead of rescanning a growing list.
            seen = set()
            for element in content.split():
                if len(element) < 3 or element in remove_word or element in seen:
                    continue
                counter[element] += 1
                seen.add(element)
    return counter
def download(url,num,path):
    """Save the filtered text of ``url`` to ``<path>/<num>.txt``.

    The text is obtained from ``TextPre.convert_and_filter(url)`` before the
    output file is opened, so a failed fetch does not create or truncate
    the file.  An existing file with the same name is overwritten.

    Args:
        url: location passed unchanged to ``TextPre.convert_and_filter``.
        num: identifier used (via ``str``) as the file's base name.
        path: directory prefix; joined to the file name with a literal "/".
    """
    content = TextPre.convert_and_filter(url)
    # The context manager closes the handle even when write() raises.
    with open(path+"/"+str(num)+".txt","w+") as out_file:
        out_file.write(content)
def matrix_constructor_topic():
    """Build a binary word-presence matrix and a label vector.

    Inputs read from the working directory:

    * ``info.in`` -- the first line must hold two integers: the number of
      rows and the number of columns of the matrix.
    * ``dict.txt`` -- one entry per line; the first whitespace-separated
      token of each non-blank line is a word, and its column is the running
      count of non-blank lines read before it.  Blank lines are skipped.

    Every element after the first of every section of every topic in the
    module-level ``all_topic`` is treated as a problem id (absolute value
    taken; ids below 800 are skipped).  Each remaining problem fills the
    next row of ``X``: the cell of every dictionary word found in the text
    from ``TextPre.convert_and_filter(url_conv(id))`` is set to 1.  The
    matching entry of ``Y`` receives the index of the topic.

    Side effects:
        * prints progress and debug lines;
        * adds one to the module-level ``amount_samples`` per problem;
        * appends the three values returned by ``TextPre.max_input_num``
          (converted with ``int``) to the module-level lists ``input_max``,
          ``input_size`` and ``des_size``;
        * appends the final ``amount_samples`` to ``info.in`` with no
          separator or trailing newline.

    Returns:
        ``(X, Y)`` -- ``X`` is a list of rows of 0/1 ints, ``Y`` is a list
        of int labels with one entry per row.

    Raises:
        IndexError: if the first line of ``info.in`` has fewer than two
            fields, if more problems are processed than ``info.in``
            declared rows, or if a dictionary column is beyond the declared
            column count.
        ValueError: if the two header fields are not integers.
    """
    global amount_samples

    with open('info.in','r+') as info_file:
        header = info_file.readline()
    print(header)
    if len(header) == 0:
        print("NO INFO")
    fields = header.split()
    if len(fields) < 2:
        # Same exception type the bare index access used to raise, but with
        # a message that names the actual problem.
        raise IndexError(
            "info.in: expected two integers (row count and column count) "
            "on the first line"
        )
    a_s = int(fields[0])
    f_s = int(fields[1])

    # word -> column index.  A repeated word keeps the index of its last
    # occurrence, and the index still advances for every non-blank line.
    ref = {}
    idx = 0
    with open("dict.txt","r+") as dict_file:
        for line in dict_file:
            tokens = line.split()
            if not tokens:
                # Blank line (e.g. trailing newline at end of file).
                continue
            ref[tokens[0]] = idx
            print (tokens[0]," ",idx)
            idx += 1

    X = [[0] * f_s for _ in range(a_s)]
    Y = [0] * a_s

    smp_idx = 0
    # NOTE(review): the label advances once per topic (not per section);
    # confirm this matches the intended labelling.
    for lab_idx, topic in enumerate(all_topic):
        for section in topic:
            print (section[0])
            for entry in section[1:]:
                num = abs(entry)
                if num < 800:
                    continue
                amount_samples += 1

                url = url_conv(num)
                content = TextPre.convert_and_filter(url)
                input_num,input_length,des_length = TextPre.max_input_num(content)
                input_max.append(int(input_num))
                input_size.append(int(input_length))
                des_size.append(int(des_length))

                for element in content.split():
                    print (element)
                    if element in ref:
                        column = ref[element]
                        # Three spaces: what the Python 2 statement
                        # `print smp_idx," ",column` emits.
                        print("%s   %s" % (smp_idx, column))
                        # Presence flag only; cells are 0 or 1, so no
                        # guard is needed before assigning.
                        X[smp_idx][column] = 1

                Y[smp_idx] = lab_idx
                smp_idx += 1

    with open('info.in','a+') as info_file:
        info_file.write("%d" % int(amount_samples))  # Number of samples
    return X,Y