def feature_apply(feature_extractor, feature_vector, attribute, number_of_file,
                  corpus_root=None):
    """
    Extract features from each document of the training corpus.

    :param feature_extractor: function that extracts features; called as
        ``feature_extractor(txt, feature_vector)``
    :param feature_vector: contains a list of features (passed through to
        ``feature_extractor`` unchanged)
    :param attribute: name of the attribute read from each document's root
        element and paired with the extracted features (indicates whether
        the process is for gender or age feature extraction)
    :param number_of_file: number of documents to be processed (only the
        first ``number_of_file`` file ids of the corpus are read)
    :param corpus_root: optional directory of the corpus; when omitted the
        original hard-coded training-corpus location is used
    :return: list of ``(features, root.attrib[attribute])`` tuples, one per
        document whose text contains at least one sentence
    """
    if corpus_root is None:
        corpus_root = '/root/Downloads/TextMining/pan13-author-profiling-training-corpus-2013-01-09/en'

    newcorpus = XMLCorpusReader(corpus_root, '.*')
    doc_list = newcorpus.fileids()
    print(len(doc_list))

    feature_set = []
    for position, fileid in enumerate(doc_list[:number_of_file], 1):
        # Progress indicator: report every 50th document.
        if position % 50 == 0:
            print(position)

        root = newcorpus.xml(fileid)
        conversations = root[0]
        declared_count = int(conversations.attrib["count"])

        # Take at most ``declared_count`` children by slicing the real child
        # list, so a "count" attribute larger than the actual number of
        # children no longer raises IndexError.
        children = list(conversations)[:max(declared_count, 0)]
        txt = " ".join(child.text for child in children
                       if child.text is not None)

        # Documents without any detectable sentence are skipped.
        if textstat.sentence_count(txt) != 0:
            feature_set.append((feature_extractor(txt, feature_vector),
                                root.attrib[attribute]))
    return feature_set
def test_set(corpus_dir, feature_extrator, vect_path, i):
    """
    Read and process the test set and extract features for each document.

    :param corpus_dir: path of the test set. The truth file is looked up by
        dropping the last two characters of this path (after removing any
        trailing path separators) and appending ``truth-en.txt``
    :param feature_extrator: function that extracts features; called as
        ``feature_extrator(txt, vect)``
    :param vect_path: passed to ``create_feature_vect`` to build the feature
        vector handed to ``feature_extrator``
    :param i: index of the class in the true_pred dictionary values; if 0 it
        refers to the gender, else it refers to the age
    :return: list of ``(features, true_pred[file_id][i])`` tuples, one per
        document with non-empty text containing at least one sentence
    """
    vect = create_feature_vect(vect_path)
    newcorpus = XMLCorpusReader(corpus_dir, '.*')
    doc_list = newcorpus.fileids()

    # Strip trailing separators before cutting the two-character suffix so
    # that "path/en/" resolves to the same truth file as "path/en".
    truth_path = corpus_dir.rstrip('/\\')[:-2] + "truth-en.txt"
    true_pred = extract_true_pred(truth_path)

    test_feature_set = []
    for xml_name in doc_list:
        root = newcorpus.xml(xml_name)
        print(root[0].attrib["count"])
        txt = fetch_text(root)
        # Cheap emptiness check first; the sentence counter only runs on
        # non-empty text.
        if txt != "" and textstat.sentence_count(txt) != 0:
            test_feature_set.append((feature_extrator(txt, vect),
                                     true_pred[xml_name][i]))
    return test_feature_set
def xml(self, fileids=None, categories=None):
    """
    Return the parsed XML of the single file selected by the arguments.

    :param fileids: file ids forwarded to ``self._resolve``
    :param categories: categories forwarded to ``self._resolve``
    :return: whatever ``XMLCorpusReader.xml`` returns for the one resolved
        file id
    :raises TypeError: when the resolved selection does not contain exactly
        one file
    """
    # ``_resolve`` yields a pair; only its first element (the file ids) is used.
    resolved, _ = self._resolve(fileids, categories)
    if len(resolved) != 1:
        raise TypeError('Expected a single file')
    return XMLCorpusReader.xml(self, resolved[0])
def xml(self, fileid=None):
    """
    Return the parsed XML for ``fileid``.

    Delegates directly to ``XMLCorpusReader.xml`` with this reader instance.

    :param fileid: file id forwarded unchanged (defaults to ``None``)
    :return: the result of ``XMLCorpusReader.xml(self, fileid)``
    """
    parsed = XMLCorpusReader.xml(self, fileid)
    return parsed