# print(v) # print(doc.class_name) if class_name in v: class_label = k train_dict.update({(docid + class_name): [class_label, {term_id: term_val}]}) # write to file with open(training_file_tfidf, "w") as train_obj: for doc, val in train_dict.items(): x = '' classid = str(train_dict[doc][0]) for i in val[1:]: for k, v in i.items(): x = x + " " + str(k) + ":" + str(v) tfidfdata = classid + "\t" + x + "\n" train_obj.write(tfidfdata) print("training data file.tfidf generated successfully") if __name__ == '__main__': '''class_defn_file("class_sample_file") index_obj = InvertedIndex() iindex = index_obj.indexingCranfield("sample_newsgroup") feature_defn_file(iindex,"feature_sample_file") training_file_idf("feature_sample_file", "class_sample_file", "training_sample_file.tf", "training_sample_file.idf", "training_sample_file.tfidf", iindex)''' class_defn_file("class_definition_file") index_obj = InvertedIndex() iindex = index_obj.indexingCranfield("mini_newsgroups") feature_defn_file(iindex, "feature_definition_file") training_file("feature_definition_file", "class_definition_file", "training_data_file.tf", "training_data_file.idf", "training_data_file.tfidf", iindex)
def extractfeature(self, directoryOfNewsgroup, featureDefinitionFile,
                   classDefinitionFile, trainingDataFile):
    """Index a newsgroup directory and write libsvm-style training files.

    Steps, in order:

    1. Build an inverted index of ``directoryOfNewsgroup`` with
       ``InvertedIndex().indexingCranfield``.
    2. Write ``featureDefinitionFile`` with one ``"<id> <term>"`` line per
       indexed term (ids start at 1) and record ``term -> id`` in
       ``self.termIdLookup``.
    3. Write ``classDefinitionFile`` with the fixed class table below.
    4. Write three training files under fixed relative names:
       ``training_data_file.TF`` (term frequency rounded to 5 places),
       ``training_data_file.IDF`` (the term's idf) and
       ``training_data_file.TFIDF`` (term frequency * idf). Each document
       becomes one line ``"<classid> <feature>:<value> ..."``.

    Args:
        directoryOfNewsgroup: Directory handed to the indexer and to
            ``self.getNewsGroupFile``.
        featureDefinitionFile: Path of the feature definition file to write.
        classDefinitionFile: Path of the class definition file to write.
        trainingDataFile: Accepted for interface compatibility but not used;
            the three training file names are hard-coded.

    Returns:
        None. All results are written to disk and to ``self.termIdLookup``.

    Notes:
        * A document that appears in none of the six class lists is skipped
          with a message. Previously it raised ``UnboundLocalError`` (first
          row) or silently inherited the label of the preceding row.
        * Each training file is opened once in ``"w"`` mode, so a stale file
          is truncated and an empty file is produced when there are no rows.
    """
    index = InvertedIndex().indexingCranfield(directoryOfNewsgroup)

    # Feature definition file: "<id> <term>", ids assigned in index order.
    with open(featureDefinitionFile, "w") as feature_file:
        for term_number, term in enumerate(index.items.keys(), start=1):
            feature_file.write(str(term_number) + " " + term + "\n")
            self.termIdLookup[term] = term_number

    # The project requirement fixes the newsgroup -> class mapping, so the
    # class definition file is emitted from this hard-coded table.
    class_definitions = (
        "1 comp.graphics",
        "1 comp.os.ms-windows.misc",
        "1 comp.sys.ibm.pc.hardware",
        "1 comp.sys.mac.hardware",
        "1 comp.windows.x",
        "2 rec.autos",
        "2 rec.motorcycles",
        "2 rec.sport.baseball",
        "2 rec.sport.hockey",
        "3 sci.crypt",
        "3 sci.electronics",
        "3 sci.med",
        "3 sci.space",
        "4 misc.forsale",
        "5 talk.politics.misc",
        "5 talk.politics.guns",
        "5 talk.politics.mideast",
        "6 talk.religion.misc",
        "6 alt.atheism",
        "6 soc.religion.christian",
    )
    with open(classDefinitionFile, "w") as class_file:
        class_file.writelines(entry + "\n" for entry in class_definitions)

    def class_id_for(doc_id, groups):
        """Return the class id (1-6) containing doc_id, or None if unlisted."""
        matched = None
        for class_id, members in enumerate(groups, start=1):
            if doc_id in members:
                # No break: when several lists contain the document the
                # highest class id wins, matching the original if-sequence.
                matched = class_id
        return matched

    def write_training_file(path, label, value_of, echo=False):
        """Build one ``<classid> <feature>:<value> ...`` file from the index."""
        print(label + ' start')

        # Re-fetched for every pass, exactly as before.
        # NOTE(review): can probably be hoisted if getNewsGroupFile has no
        # side effects - confirm against its definition.
        newsgroup = self.getNewsGroupFile(directoryOfNewsgroup)
        groups = (
            newsgroup.class1items1,
            newsgroup.class1items2,
            newsgroup.class1items3,
            newsgroup.class1items4,
            newsgroup.class1items5,
            newsgroup.class1items6,
        )

        # docID -> ["<feature>:<value>", ...] in index iteration order.
        rows = {}
        for term in index.items.keys():
            entry = index.items.get(term)
            feature_id = self.getKeysByValue(self.termIdLookup, term)
            for posting_key in entry.posting.keys():
                posting = entry.posting.get(posting_key)
                # repr() reproduces what the former str(list)-based
                # formatting emitted for each element (identical for plain
                # ints/floats, whose repr has no commas).
                rows.setdefault(posting.docID, []).append(
                    "{!r}:{!r}".format(feature_id, value_of(entry, posting)))

        with open(path, "w") as training_file:
            for doc_id, features in rows.items():
                class_id = class_id_for(doc_id, groups)
                if class_id is None:
                    print("skipping document {!r}: not listed in any class"
                          .format(doc_id))
                    continue
                line = str(class_id) + " " + " ".join(features) + "\n"
                if echo:
                    print(line)
                training_file.write(line)

        print(label + ' complete')

    # The TF pass keeps echoing every row to stdout, as it did before.
    write_training_file(
        "training_data_file.TF", 'tf',
        lambda entry, posting: round(posting.termfreq, 5),
        echo=True)
    write_training_file(
        "training_data_file.IDF", 'idf',
        lambda entry, posting: entry.idf)
    write_training_file(
        "training_data_file.TFIDF", 'TF-idf',
        lambda entry, posting: posting.termfreq * entry.idf)