def main():
    # parse the arguments that are passed to main
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    # these are the default input/output files if nothing is passed
    INPUT_FILE = os.path.join("..", "..", "data", "iTrain.csv")
    OUTPUT_FILE = os.path.join("..", "..", "data", "folds", "iTrain.csv")

    parser.add_argument('-i', '--input-file', type=str, dest='input_file', default=INPUT_FILE,
                        help="File to be processed. Defaults to '%(default)s'")
    parser.add_argument('--ns', '--num-samples', dest='num_samples', default=None, type=int,
                        help='The number of samples')
    parser.add_argument('--nf', '--num-folds', dest='num_folds', default=5, type=int,
                        help='The number of folds')
    parser.add_argument('-o', '--output-file', type=str, dest='output_file', default=OUTPUT_FILE,
                        help="File to be used to save the fold indices. Defaults to '%(default)s'")
    args = parser.parse_args()

    # read the data (only needed to count the samples if --ns is not given)
    if args.num_samples is None:
        data_in = pandas.read_csv(args.input_file, encoding="ISO-8859-1")
        num_samples = len(data_in)
    else:
        num_samples = args.num_samples

    # generate the folds: shuffle the sample indices and assign them
    # round-robin to folds numbered 1..num_folds
    from random import shuffle
    indices_list = list(range(0, num_samples))
    shuffle(indices_list)
    folds_indices = [x % args.num_folds + 1 for x in indices_list]

    # write the indices to file, but don't give row names
    helpers.ensure_dir(os.path.dirname(args.output_file))
    folds_indices_towrite = pandas.Series(numpy.array(folds_indices))
    folds_indices_towrite.to_csv(args.output_file, index=False)
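# These scripts rely on a project-local `helpers.ensure_dir` that is not shown
# in this excerpt. A minimal sketch of what such a helper might look like
# (an assumption, not the project's actual implementation):
import os


def ensure_dir(directory):
    """Create `directory` (and any missing parents) if it does not exist yet."""
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)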
def main():
    # parse the arguments that are passed to main
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    INPUT_FILE = os.path.join("..", "..", "data", "features", "hierarchy-word-features.pkl")
    OUTPUT_FILE = os.path.join("..", "..", "data", "params", 'params-decisiontree.pkl')

    parser.add_argument('-i', '--input-file', type=str, dest='input_file', default=INPUT_FILE,
                        help="File with the list of item classes and features. Defaults to '%(default)s'")
    parser.add_argument('-c', '--classifier', type=str, dest='classifier', default="decision-tree",
                        help="The classifier to be used. Defaults to '%(default)s'")
    parser.add_argument('-o', '--output-file', type=str, dest='output_file', default=OUTPUT_FILE,
                        help="A file to output the best grid-search parameters. Defaults to '%(default)s'")
    args = parser.parse_args()

    # Read the input dictionary
    type_classes, source_classes, token_container = pickle.load(open(args.input_file, "rb"))
    type_dict = {'IGI': 0, 'IC': 1, 'IV': 2, 'IG': 3}

    # get all the label data
    labels_orig = [type_dict[x] for x in type_classes]
    data_orig = token_container

    if args.classifier == 'decision-tree':
        # import ipdb; ipdb.set_trace()
        clf = tree.DecisionTreeClassifier(criterion='entropy')
        parameters = {
            'clf__max_depth': [i for i in range(25, 200, 25)],
        }
    elif args.classifier == 'random-forest':
        clf = ensemble.RandomForestClassifier(criterion='entropy')
        parameters = {
            'clf__max_depth': [i for i in range(25, 200, 25)],
        }
    else:  # 'logistic-regression'
        clf = linear_model.LogisticRegression()
        parameters = {
            'clf__C': [0.5, 1, 5, 10],
        }

    ppl = pipeline.Pipeline([
        ('vectorizer', feature_extraction.DictVectorizer(sparse=True)),
        ('clf', clf),
    ])

    gs = grid_search.GridSearchCV(ppl, parameters, verbose=1, cv=5)
    gs.fit(data_orig, labels_orig)
    print(gs.best_params_, gs.best_score_)

    # save the best parameters and the corresponding score
    helpers.ensure_dir(os.path.dirname(args.output_file))
    pickle.dump([gs.best_params_, gs.best_score_], open(args.output_file, "wb"))
def main():
    # parse the arguments that are passed to main
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    INPUT_FILE = os.path.join("..", "..", "data", "features", "hierarchy-word-features.pkl")
    OUTPUT_FILE = os.path.join("..", "..", "data", "params", 'params-decisiontree.pkl')

    parser.add_argument('-i', '--input-file', type=str, dest='input_file', default=INPUT_FILE,
                        help="File with the list of item classes and features. Defaults to '%(default)s'")
    parser.add_argument('-n', '--ngrams', dest='nGrams', type=int, default=1,
                        help='Defines how to split words by ngrams. Default is tokenized to one word ngrams')
    parser.add_argument('--ti', dest='tf_idf', action='store_true', default=False,
                        help='Boolean - If set, TfIdf features will be used')
    parser.add_argument('-b', dest='binary', action='store_true', default=False,
                        help='Boolean - If set, CountVectorizer will use binary counts instead of frequency counts')
    parser.add_argument('-c', '--classifier', type=str, dest='classifier', default="decision-tree",
                        help="The classifier to be used. Defaults to '%(default)s'")
    parser.add_argument('--cat', type=str, dest='category', default='income-type',
                        choices=('income-type', 'income-source', 'expenditure-type'),
                        help="The type of categorization. Defaults to '%(default)s'")
    parser.add_argument('-o', '--output-file', type=str, dest='output_file', default=OUTPUT_FILE,
                        help="A file to output the best grid-search parameters. Defaults to '%(default)s'")
    args = parser.parse_args()

    # Read the input dataframe
    data_in = pandas.read_pickle(args.input_file)
    type_classes = list(data_in['type_class'])
    source_classes = list(data_in['source_class'])
    frID = list(data_in['frID'])
    data_orig = data_in['description']

    # pick the labels for the requested categorization,
    # converting them to strings if they are not strings already
    if args.category == 'income-type' or args.category == 'expenditure-type':
        labels_orig = [str(i) for i in type_classes]
    else:
        labels_orig = [str(i) for i in source_classes]

    if args.classifier == 'decision-tree':
        clf = tree.DecisionTreeClassifier(criterion='entropy')
        parameters = {
            'clf__max_depth': [i for i in range(25, 200, 25)],
        }
    elif args.classifier == 'random-forest':
        clf = ensemble.RandomForestClassifier(criterion='entropy')
        parameters = {
            'clf__max_depth': [i for i in range(25, 200, 25)],
        }
    elif args.classifier == 'logistic-regression':
        clf = linear_model.LogisticRegression()
        parameters = {
            'clf__C': [0.5, 1, 5, 10],
        }
    else:  # SVM
        clf = svm.SVC()
        parameters = {
            'clf__C': [0.5, 1.0, 5.0, 10],
        }

    vectorizer = feature_extraction.text.CountVectorizer(
        analyzer='word',               # build word (not character) n-grams
        binary=args.binary,            # if True, all non-zero counts are set to one
        decode_error='strict',         # raise on byte sequences outside the given encoding
        encoding="ISO-8859-15",
        input='content',
        lowercase=False,               # do not lower-case before tokenizing (done upstream)
        max_df=1.0,                    # ignore terms with a document frequency above this threshold
        max_features=None,             # keep the full vocabulary
        ngram_range=(1, args.nGrams),  # extract all n-grams with 1 <= n <= nGrams
        preprocessor=None,
        stop_words=None,
        # min_df=1,
        strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None,
        vocabulary=None)

    if args.tf_idf:
        transformer = feature_extraction.text.TfidfTransformer()
        ppl = pipeline.Pipeline([
            ('vectorizer', vectorizer),
            ('transformer', transformer),
            ('clf', clf),
        ])
    else:
        ppl = pipeline.Pipeline([
            ('vectorizer', vectorizer),
            ('clf', clf),
        ])

    k_fold = cross_validation.StratifiedKFold(labels_orig, 5, shuffle=True)
    gs = grid_search.GridSearchCV(ppl, parameters, verbose=3, cv=k_fold)
    gs.fit(data_orig, labels_orig)
    print(gs.best_params_, gs.best_score_)

    # save the best parameters and the corresponding score
    helpers.ensure_dir(os.path.dirname(args.output_file))
    pickle.dump([gs.best_params_, gs.best_score_], open(args.output_file, "wb"))
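# For reference, the pickle written above is a two-element list
# [best_params_, best_score_]; downstream scripts reload and unpack it along
# these lines (illustrative only; the file name is the default used above):
import pickle

with open("../../data/params/params-decisiontree.pkl", "rb") as f:
    best_params, best_score = pickle.load(f)
print("best max depth:", best_params.get("clf__max_depth"), "score:", best_score)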
def main():
    # parse the arguments that are passed to main
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # these are the default input/output files if nothing is passed
    INPUT_FILE = os.path.join("..", "..", "data", "iTrain.csv")
    OUTPUT_FILE = os.path.join("..", "..", "data", "features", "data-frame.csv")

    parser.add_argument('-i', '--input-file', type=str, dest='inputFile', default=INPUT_FILE,
                        help='File to be processed to a boolean matrix')
    parser.add_argument('-l', '--lemmatize', dest='lemmatize', action='store_true', default=False,
                        help='Boolean - If set, verbs will be lemmatized')
    parser.add_argument('-la', '--lemmatizeall', dest='lemmatizeall', action='store_true', default=False,
                        help='Boolean - If set, all words will be lemmatized')
    parser.add_argument('-lc', '--lower-case', dest='lowerCase', action='store_true', default=True,
                        help='Boolean - Defaults to converting all to lower-case')
    parser.add_argument('-rw', '--remove-words', dest='removeWords', action='store_true', default=None,
                        help='Accepts a list of types of words to be removed e.g. ...')
    parser.add_argument('-s', '--stematize', dest='stematize', action='store_true', default=False,
                        help='Boolean - If set, all words will be stematized')
    parser.add_argument('--sa', '--strip-accents', dest='stripAccents', action='store_true', default=False,
                        help="Removes accents on letters, replacing them with just the letter itself")
    parser.add_argument('--sp', '--spelling-corrector', dest='spellCorrect', action='store_true', default=False,
                        help="Correct spelling mistakes word by word, just taking the most likely correction")
    parser.add_argument('--sw', '--stop-words', dest='stopWords', action='store_true', default=False,
                        help='Removes the most common words, "stop words", from the text')
    parser.add_argument('-t', '--tokenize', dest='token', action='store_true', default=False,
                        help='Tokenizes text to individual words')
    parser.add_argument('--ta', '--alpha-numeric', dest='alphaNumeric', action='store_false', default=True,
                        help='Boolean - If NOT set, the file will be tokenized with non alpha-numeric words left in. '
                             'Default is TRUE')
    parser.add_argument('--th', '--token-hyphen', dest='tokenHyphen', action='store_true', default=False,
                        help='Tokenizes text using the hierarchy structure')
    parser.add_argument('-uc', '--upper-case', dest='upperCase', action='store_true', default=False,
                        help='Boolean - If set, all words will be converted to upper-case')
    parser.add_argument('-o', '--output-file', type=str, dest='output_file', default=OUTPUT_FILE,
                        help="File to be used to save the created data frame. Defaults to '%(default)s'")
    args = parser.parse_args()

    data_in = pandas.read_csv(args.inputFile, encoding="ISO-8859-1")

    # set column names
    words = pandas.DataFrame({
        'type': data_in.type_class,
        'class': data_in.source_class,
        'description': data_in.description
    })
    # print(words.head())

    # =========================================================================
    # determine the length of the longest description (kept for reference)
    # =========================================================================
    # frame_len = 0
    # for x in range(1, len(words)):
    #     temp_len = len(words.description[x].split())
    #     if temp_len > frame_len:
    #         frame_len = temp_len
    #         print(frame_len)
    #         print(words.description[x].split())

    processed_data = pandas.DataFrame(data_in[['type_class', 'source_class']])

    # function calls need to be edited to send the relevant columns (excluding type and source).
    # It will require a restructuring of data types. Not sure if we can use a dynamic data frame,
    # as we don't know how many words are in each row - it will also differ depending on processing.

    # tokenize the text, either straight or keeping only alpha-numeric characters (default)
    if args.alphaNumeric:
        # keep just the alpha-numeric characters
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'\w+')
        word_list = list(map(tokenizer.tokenize, words.description))
        # print("alpha numeric only")
    else:
        word_list = list(map(nltk.word_tokenize, words.description))
        # print(word_list[1:20])

    if args.tokenHyphen:
        # tokenize on hyphen
        word_list = list(map(text_processing.tokenize_on_hyphen, words.description))

    # lower case
    if args.lowerCase:
        word_list = list(map(text_processing.make_lower, word_list))

    # upper case
    if args.upperCase:
        word_list = list(map(text_processing.make_upper, word_list))

    if args.lemmatizeall:
        word_list = list(map(text_processing.lemmatizeall, word_list))

    if args.lemmatize:
        word_list = list(map(text_processing.lemmatize, word_list))

    if args.removeWords:
        print("REMOVE WORDS")  # needs a function in text_processing

    if args.stematize:
        word_list = list(map(text_processing.stematize, word_list))

    if args.stopWords:
        word_list = list(map(text_processing.exclude_stop_words, word_list))

    wl_df = pandas.DataFrame(word_list)
    frames = [processed_data, wl_df]
    output_df = pandas.concat(frames, axis=1)
    # print(output_df[1:20])

    # write to file, but don't give row names
    helpers.ensure_dir(os.path.dirname(args.output_file))
    output_df.to_csv(args.output_file, index=False)
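# The pre-processing steps above call into a project-local `text_processing`
# module (make_lower, lemmatize, exclude_stop_words, ...) that is not shown in
# this excerpt. A rough sketch of two of those helpers, assuming NLTK and its
# stopword corpus are available (illustrative stand-ins, not the project's
# actual implementations):
from nltk.corpus import stopwords

_stop_words = set(stopwords.words("english"))


def make_lower(tokens):
    """Lower-case every token in a tokenized description."""
    return [t.lower() for t in tokens]


def exclude_stop_words(tokens):
    """Drop the most common English words from a tokenized description."""
    return [t for t in tokens if t.lower() not in _stop_words]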
def main():
    # parse the arguments that are passed to main
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # these are the default input/output locations if nothing is passed
    INPUT_FILE = os.path.join("..", "..", "data", "features", "hierarchy-word-features.pkl")
    OUTPUT_DIR = os.path.join("..", "..", "data", "classifiers", "decision_trees")
    FOLD_FILE = os.path.join("..", "..", "data", "folds", "newfolds.csv")

    parser.add_argument('-i', '--input-file', type=str, dest='input_file', default=INPUT_FILE,
                        help="File with the list of item classes and features. Defaults to '%(default)s'")
    parser.add_argument('--fn', '--fold-number', dest='fold_number', default=None, type=int,
                        help="The fold number to be EXCLUDED when creating the master set (if there are N folds, "
                             "N-1 folds are used for creating the master set). If 0, all the items will be "
                             "considered. If None, the classifier is trained for all the folds in a loop. "
                             "Defaults to '%(default)s'")
    parser.add_argument('--ff', '--fold-file', type=str, dest='foldFile', default=FOLD_FILE,
                        help="Fold file containing the cross-fold validation indices. Defaults to '%(default)s'")
    parser.add_argument('-o', '--output-dir', type=str, dest='output_dir', default=OUTPUT_DIR,
                        help="Directory to save the decision trees. Defaults to '%(default)s'")
    args = parser.parse_args()

    # Read the input dictionary
    type_classes, source_classes, token_container = pickle.load(open(args.input_file, "rb"))
    type_dict = {'IGI': 0, 'IC': 1, 'IV': 2, 'IG': 3}

    # get all the label data
    labels_orig = [type_dict[x] for x in type_classes]

    # Create output directory
    helpers.ensure_dir(args.output_dir)

    dtree = tree.DecisionTreeClassifier(random_state=0, max_depth=100, criterion='entropy')
    vectorizer = feature_extraction.DictVectorizer(sparse=True)

    if args.fold_number is None:
        # loop over all folds and create the classifier
        print("Training will be done iteratively for all folds...\n")
        cross_fold_indices = pandas.read_csv(args.foldFile, header=None)[0]
        for fold in cross_fold_indices.unique():
            print("Training classifier for fold %d...\n" % fold)
            data_fold = [token_container[i] for i in range(len(cross_fold_indices))
                         if cross_fold_indices[i] != fold]
            labels_fold = [labels_orig[i] for i in range(len(cross_fold_indices))
                           if cross_fold_indices[i] != fold]
            data_matrix = vectorizer.fit_transform(data_fold)
            dtree.fit(data_matrix, labels_fold)
            joblib.dump(dtree, os.path.join(args.output_dir, 'tree-fold%d.pkl' % fold))
            joblib.dump(vectorizer, os.path.join(args.output_dir, 'vectorizer-fold%d.pkl' % fold))
            labels_pred = dtree.predict(data_matrix)
            score = metrics.accuracy_score(labels_fold, labels_pred)
            print("Accuracy on fold %d (train set): %.5f" % (fold, score))
            tree.export_graphviz(dtree,
                                 out_file=os.path.join(args.output_dir, 'tree-fold%d.dot' % fold),
                                 max_depth=5)  # , feature_names=master_in.values

    elif args.fold_number == 0:
        # use all the data to train the classifier
        print("Training classifier for the full set...\n")
        data_matrix = vectorizer.fit_transform(token_container)
        # import ipdb; ipdb.set_trace()
        dtree.fit(data_matrix, labels_orig)
        joblib.dump(dtree, os.path.join(args.output_dir, 'tree.pkl'))
        joblib.dump(vectorizer, os.path.join(args.output_dir, 'vectorizer.pkl'))
        tree.export_graphviz(dtree,
                             out_file=os.path.join(args.output_dir, 'tree.dot'),
                             max_depth=5)  # , feature_names=master_in.values

    else:
        # create the classifier for a particular fold
        fold = args.fold_number
        print("Training classifier for fold %d...\n" % fold)
        cross_fold_indices = pandas.read_csv(args.foldFile, header=None)[0]
        data_fold = [token_container[i] for i in range(len(cross_fold_indices))
                     if cross_fold_indices[i] != fold]
        labels_fold = [labels_orig[i] for i in range(len(cross_fold_indices))
                       if cross_fold_indices[i] != fold]
        data_matrix = vectorizer.fit_transform(data_fold)
        dtree.fit(data_matrix, labels_fold)
        joblib.dump(dtree, os.path.join(args.output_dir, 'tree-fold%d.pkl' % fold))
        joblib.dump(vectorizer, os.path.join(args.output_dir, 'vectorizer-fold%d.pkl' % fold))
        labels_pred = dtree.predict(data_matrix)
        score = metrics.accuracy_score(labels_fold, labels_pred)
        print("Accuracy on fold %d (train set): %.5f" % (fold, score))
def main():
    # parse the arguments that are passed to main
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # these are the default input/output locations if nothing is passed
    INPUT_FILE = os.path.join("..", "..", "data", "iTrain_extra_lastitem_mod_new.csv")  # iTrain.csv
    FOLD_FILE = os.path.join("..", "..", "data", "folds", "iTrain_fold.csv")  # iTrain.csv
    OUTPUT_FILE = os.path.join("..", "..", "data", "features", "new", "dframe_new_iTrain.csv")
    BAD_OUTPUT_FILE = os.path.join("..", "..", "data", "features", "welsh_iTrain.csv")
    SPELL_CHECKER_PATH = os.path.join("..", "..", "data", "big.txt")

    parser.add_argument('-i', '--input-file', type=str, dest='inputFile', default=INPUT_FILE,
                        help='File to be processed to a master set. The file must be saved in data and the '
                             'argument structured as ../../data/yourfilename.csv')
    parser.add_argument('--lan', '--language', dest='language', action='store_true', default=False,
                        help='Boolean - If set, the language will be determined and non-English items removed')
    parser.add_argument('-l', '--lemmatize', dest='lemmatize', action='store_true', default=False,
                        help='Boolean - If set, verbs will be lemmatized')
    parser.add_argument('--la', '--lemmatizeall', dest='lemmatizeall', action='store_true', default=False,
                        help='Boolean - If set, all words will be lemmatized')
    parser.add_argument('--lc', '--lower-case', dest='lowerCase', action='store_false', default=True,
                        help='Boolean - Defaults to converting all to lower-case; if set, lower-casing is disabled')
    parser.add_argument('--rw', '--remove-words', dest='removeTags', nargs='+', default=None,
                        help='Accepts a list of types of words to be removed, from: ADJ, ADV, CNJ, DET, EX, FW, '
                             'MOD, N, NP, NUM, PRO, P, TO, UH, V, VD, VG, VN, WH')
    parser.add_argument('-s', '--stematize', dest='stematize', action='store_true', default=False,
                        help='Boolean - If set, all words will be stematized')
    parser.add_argument('--sa', '--strip-accents', dest='stripAccents', action='store_false', default=True,
                        help="Removes accents on letters, replacing them with just the letter itself (on by default)")
    parser.add_argument('--sp', '--spelling-corrector', dest='spellCorrect', action='store_false', default=True,
                        help="Correct spelling mistakes word by word, just taking the most likely correction "
                             "(on by default)")
    parser.add_argument('--sd', '--spell-dictionary', dest='spell_dictionary', default=SPELL_CHECKER_PATH,
                        help="File containing the dictionary to be used for spell-check. Defaults to '%(default)s'")
    parser.add_argument('--sw', '--stop-words', dest='stopWords', action='store_false', default=True,
                        help='Removes the most common words, "stop words", from the text (on by default)')
    # parser.add_argument('-t', '--tokenize', dest='tokenize', action='store_true', default=False,
    #                     help='Tokenizes text to individual words')
    parser.add_argument('--ta', '--alpha-numeric', dest='alphaNumeric', action='store_false', default=True,
                        help='Boolean - If NOT set, the file will be tokenized with non alpha-numeric words left in. '
                             'Default is TRUE')
    parser.add_argument('--th', '--token-hyphen', dest='tokenHyphen', action='store_true', default=False,
                        help='Tokenizes text using the directory structure from the input file')
    parser.add_argument('--uc', '--upper-case', dest='upperCase', action='store_true', default=False,
                        help='Boolean - If set, all words will be converted to upper-case')
    parser.add_argument('--fn', '--fold-number', dest='fold_number', default=0, type=int,
                        help="The fold number to be EXCLUDED when creating the master set (if there are N folds, "
                             "N-1 folds are used for creating the master set). If 0, all the items will be "
                             "considered. Defaults to '%(default)s'")
    parser.add_argument('--ff', '--fold-file', type=str, dest='foldFile', default=FOLD_FILE,
                        help="Fold file containing the cross-fold validation indices. Defaults to '%(default)s'")
    parser.add_argument('-o', '--output-file', type=str, dest='output_file', default=OUTPUT_FILE,
                        help="File to be used to save the created master set. Defaults to '%(default)s'")
    parser.add_argument('--bo', '--bad-output-file', type=str, dest='bad_output_file', default=BAD_OUTPUT_FILE,
                        help="File to be used to save the items detected as Welsh. Defaults to '%(default)s'")
    args = parser.parse_args()

    # data_in = pandas.read_csv(args.inputFile, encoding="ISO-8859-1")
    data_in = pandas.read_csv(args.inputFile, encoding="ISO-8859-15", dtype=str)

    # set column names
    words = pandas.DataFrame({
        'frID': data_in.frID,
        'type': data_in.type_class,
        'class': data_in.source_class,
        'description': data_in.description,
        'ICNPO_category': data_in.ICNPO_category,
        'nicename': data_in.nicename
    })
    processed_data = pandas.DataFrame(data_in[[
        'frID', 'type_class', 'source_class', 'ICNPO_category', 'nicename'
    ]])

    # define word_list
    word_list = words.description
    # print(words.head())

    if args.language:
        # check that the text is in English and separate out the rest;
        # only items with en < 0.01 are taken to be bad
        print('Items that are more likely to be Welsh:')
        langval = text_processing.language(word_list)
        good = numpy.where([x >= 0.01 for x in langval])
        bad = numpy.where([x < 0.01 for x in langval])
        badwords = words.drop(words.index[good])
        words = words.drop(words.index[bad])
        word_list = words.description
        # write the suspected-Welsh items to file, but don't give row names
        helpers.ensure_dir(os.path.dirname(args.bad_output_file))
        badwords.to_csv(args.bad_output_file, index=False)
        print('Only items with en < 0.01 are taken to be bad')

    # tokenize the text, either straight or keeping only alpha-numeric characters (default)
    if args.alphaNumeric:
        # keep just the alpha-numeric characters
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'\w+')
        word_list = list(map(tokenizer.tokenize, word_list))
    # if args.tokenize:
    #     word_list = list(map(nltk.word_tokenize, words.description))

    if args.tokenHyphen:
        # tokenize on hyphen
        word_list = list(map(text_processing.tokenize_on_hyphen, word_list))

    # lower case
    if args.lowerCase:
        word_list = list(map(text_processing.make_lower, word_list))

    # upper case
    if args.upperCase:
        word_list = list(map(text_processing.make_upper, word_list))

    if args.stripAccents:
        # strip accents, row by row
        i = 0
        for x in range(len(word_list)):
            correctedWords = [text_processing.strip_accents(y) for y in word_list[x]]
            word_list[x] = correctedWords
            i = i + 1
            # if i % 100 == 0: print('row %d' % i)
        # word_list = list(map(text_processing.strip_accents, word_list))

    if args.spellCorrect:
        # correct spelling
        word_list = spell_checker.correctall(word_list, args.spell_dictionary)

    if args.removeTags:
        # remove words with the specified part-of-speech tags
        word_list = list(map(text_processing.keep_only_specified_tags, word_list, args.removeTags))

    if args.stopWords:
        word_list = list(map(text_processing.exclude_stop_words, word_list))

    if args.lemmatizeall:
        word_list = list(map(text_processing.lemmatizeall, word_list))

    if args.lemmatize:
        word_list = list(map(text_processing.lemmatize, word_list))
        # print(word_list[1:20])

    if args.stematize:
        word_list = list(map(text_processing.stematize, word_list))

    wl_df = pandas.DataFrame(word_list)
    frames = [processed_data, wl_df]
    output_df = pandas.concat(frames, axis=1)
    # print(output_df[1:20])

    # write to file, but don't give row names
    helpers.ensure_dir(os.path.dirname(args.output_file))
    output_df.to_csv(args.output_file, index=False)
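# `spell_checker.correctall` is another project-local helper: given the
# tokenized descriptions and a dictionary file (big.txt suggests a
# Norvig-style word corpus), it returns the same structure with each word
# replaced by its most likely correction. A compact, illustrative sketch of
# that idea (an assumption, not the project's actual implementation):
import re
from collections import Counter


def correctall(word_list, dictionary_path):
    """Spell-correct every token of every description, Norvig-style:
    prefer the most frequent dictionary word within one edit."""
    with open(dictionary_path, errors="ignore") as f:
        counts = Counter(re.findall(r"\w+", f.read().lower()))
    letters = "abcdefghijklmnopqrstuvwxyz"

    def edits1(word):
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [a + b[1:] for a, b in splits if b]
        replaces = [a + c + b[1:] for a, b in splits if b for c in letters]
        inserts = [a + c + b for a, b in splits for c in letters]
        return set(deletes + replaces + inserts)

    def correct(word):
        w = word.lower()
        if w in counts:
            return word
        candidates = [c for c in edits1(w) if c in counts]
        return max(candidates, key=counts.get) if candidates else word

    return [[correct(w) for w in tokens] for tokens in word_list]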
def main():
    # parse the arguments that are passed to main
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    # this is the default input directory if nothing is passed
    # this default file still contains I and IO values so needs updating
    INPUT_FILE = os.path.join("..", "..", "data", "features", "hierarchical", "new", "dframe_iTrain.csv")
    OUTPUT_DIR = os.path.join("..", "..", "data", "features")
    OUTPUT_FILENAME = 'temp/data_stri_Please_rename_or_delete_me.p'

    parser.add_argument('--ad', '--addData', dest='addData', action='store_false', default=True,
                        help='Adds charity type and name to the description list')
    parser.add_argument('-i', '--input-file', type=str, dest='inputFile', default=INPUT_FILE,
                        help="Dataframe to be processed to a bag of words. Defaults to '%(default)s'")
    parser.add_argument('--lc', '--lastCells', dest='lastCells', type=int, default=0,
                        help='If set, will extract the n last cells from the dataframe according to the input '
                             'number. For use with source_class it needs a hierarchically tokenized dataframe')
    parser.add_argument('--hi', '--hierarchy', dest='hierarchy', action='store_true', default=False,
                        help='If set, spaces will be removed from within hierarchical items')
    parser.add_argument('--wl', '--wordsForLast', dest='wordsForLast', action='store_true', default=False,
                        help='Do not concatenate the last hierarchical item, so that individual words are kept')
    parser.add_argument('--od', '--output-dir', type=str, dest='output_dir', default=OUTPUT_DIR,
                        help="Directory to save the output")
    parser.add_argument('-o', '--output-file', type=str, dest='outputFilename', default=OUTPUT_FILENAME,
                        help="Filename of the output file - must be a pickle .p format")
    args = parser.parse_args()

    data_in = pandas.read_csv(args.inputFile, encoding="ISO-8859-15")
    data_in_str = pandas.read_csv(args.inputFile, encoding="ISO-8859-15", dtype=str)
    # data_in = data_in[data_in.type_class != 'I']
    # data_in = data_in[data_in.type_class != 'IO']

    data_stri = data_in_str[['frID', 'type_class', 'source_class']].copy()
    data_stri['description'] = 'Nan'
    sep = " "

    if args.hierarchy:
        print('in hierarchy separation mode')
        token_data = data_in.iloc[0:, 5:]
        for i in range(0, len(token_data)):
            row = token_data.iloc[i, 0:]
            for j in range(0, len(row)):
                token = row.iloc[j]
                if args.wordsForLast:
                    # only collapse spaces in tokens that are not the last item of the row
                    if j < (len(row) - 1):
                        nextToken = row.iloc[j + 1]
                        if isinstance(nextToken, float):
                            if not math.isnan(nextToken):
                                if type(token) == str:
                                    token = token.replace(" ", "")
                        else:
                            if type(token) == str:
                                token = token.replace(" ", "")
                else:
                    if type(token) == str:
                        token = token.replace(" ", "")
                row[j] = token
            token_data.iloc[i] = row
        data_in.iloc[0:, 5:] = token_data  # .iloc[0:,0:]

    if args.addData:
        charity_type = data_in_str.iloc[0:, 3]
        charity_name = data_in_str.iloc[0:, 4]

    if args.lastCells == 0:
        # effectively processing type_class
        print('last cells equals 0')
        token_data = data_in.iloc[0:, 5:]
        i = 0
        for row in range(0, len(token_data)):
            token_data_stri = sep.join(map(str, token_data.iloc[row, 0:].dropna()))
            if args.addData:
                collapsedname = charity_name.iloc[row].replace(" ", "")
                collapsedtype = charity_type.iloc[row].replace(" ", "")
                all_description = sep.join([token_data_stri, collapsedname, collapsedtype])
                data_stri.iloc[row, 3] = all_description
                i = i + 1
                if i % 100 == 0:
                    print('row %d' % i)
            else:
                data_stri.iloc[row, 3] = token_data_stri
    else:
        # effectively processing for source class, as this selects the last n
        # segments of the hierarchy - needs to be passed the hierarchical dataframe
        # print('last cells equals ', args.lastCells)
        for row in range(0, len(data_in)):
            print(row)
            all_cells = list()
            for col in data_in.iloc[int(row), 5:]:
                # print("col: ", col)
                all_cells.append(col)
            all_cells = [x for x in all_cells if str(x) != 'nan']
            data_stri.iloc[int(row), 3] = sep.join(all_cells[len(all_cells) - args.lastCells:len(all_cells)])

    helpers.ensure_dir(args.output_dir)
    data_stri.to_pickle(os.path.join(args.output_dir, args.outputFilename))
def main():
    # parse the arguments that are passed to main
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    # these are the default input/output files if nothing is passed
    INPUT_HS = os.path.join("..", "..", "data", "features", "data_frame_hierarchy.csv")
    INPUT_HT = os.path.join("..", "..", "data", "features", "hierarchical-tokens.csv")
    OUTPUT_FILE = os.path.join("..", "..", "data", "features", "hierarchy-features.pkl")

    parser.add_argument("--hs", "--hierarchical-split", type=str, dest="input_hs", default=INPUT_HS,
                        help="A dataframe file containing the hierarchical level tokens for each item in the "
                             "dataset. Defaults to '%(default)s'")
    parser.add_argument("--ht", "--hierarchical-tokens", type=str, dest="input_ht", default=INPUT_HT,
                        help="A file containing the selected hierarchical level tokens. Defaults to '%(default)s'")
    parser.add_argument("-o", "--output-file", type=str, dest="output_file", default=OUTPUT_FILE,
                        help="A file to output the dictionary. Defaults to '%(default)s'")
    args = parser.parse_args()

    # Read the data
    data_hierarchy = pandas.read_csv(args.input_hs, encoding="ISO-8859-1")
    # Read the tokens
    hierarchy_tokens = pandas.read_csv(args.input_ht, header=None)[0]

    # Remove samples from classes which are noisy and not useful ("I", "IO")
    data_hierarchy = data_hierarchy[data_hierarchy.type_class != "I"]
    data_hierarchy = data_hierarchy[data_hierarchy.type_class != "IO"]

    columns_hierarchy = data_hierarchy.columns
    descr_hierarchy = data_hierarchy[columns_hierarchy[2:]]

    hierarchy_token_container = []
    for row in descr_hierarchy.iterrows():
        row_dict = {}  # initialize the feature dictionary for this row
        rowlist = list(row[1])
        rowlist = [x for x in rowlist if str(x) != "nan"]
        rowlist = [x.strip() for x in rowlist]  # remove spaces
        rowlist = [x.strip(string.punctuation) for x in rowlist]  # remove punctuation
        for token in hierarchy_tokens:
            # the feature value is the hierarchy level (position) of the token, capped at 4
            try:
                ind = rowlist.index(token)
                if ind <= 3:
                    row_dict[token] = ind
                else:
                    row_dict[token] = 4
            except ValueError:
                pass
        hierarchy_token_container += [row_dict]

    # Extract the sample classes into a more convenient format
    type_class_list = list(data_hierarchy["type_class"])
    source_class_list = list(data_hierarchy["source_class"])

    # Save everything altogether in a pkl file
    to_dump = [type_class_list, source_class_list, hierarchy_token_container]
    helpers.ensure_dir(os.path.dirname(args.output_file))
    pickle.dump(to_dump, open(args.output_file, "wb"))
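# Each entry of hierarchy_token_container is a dict mapping a selected
# hierarchy token to the (capped) level at which it appears; these dicts are
# what the training scripts feed into sklearn's DictVectorizer. A tiny
# illustration with made-up tokens (the token names below are invented):
from sklearn.feature_extraction import DictVectorizer

example_rows = [{"grants": 0, "trusts": 2}, {"donations": 1}]
vec = DictVectorizer(sparse=True)
matrix = vec.fit_transform(example_rows)  # one column per token, value = hierarchy level
print(sorted(vec.vocabulary_), matrix.toarray())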
def main():
    # parse the arguments that are passed to main
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    INPUT_FILE = os.path.join("..", "..", "data", "features", "bow_string_input_dframe.p")
    OUTPUT_FILE = os.path.join("..", "..", "data", "output", "predicted_labels_ensemble.pkl")
    OUTPUT_FILE_DESC = os.path.join("..", "..", "data", "output", "predicted_labels_ensemble.csv")

    parser.add_argument("-i", "--input-file", type=str, dest="input_file", default=INPUT_FILE,
                        help="File with the list of item classes and features. Defaults to '%(default)s'")
    parser.add_argument("-g", "--gridres-file", type=str, dest="gridres_file", default=None, nargs="+",
                        help="Files with the best parameters of the classifiers in the ensemble. If None, default "
                             "parameters will be used. The number of files should correspond to the number of "
                             "classifiers.")
    parser.add_argument("-n", "--ngrams", dest="nGrams", type=int, default=1,
                        help="Defines how to split words by ngrams. Default is tokenized to one word ngrams")
    parser.add_argument("--ti", dest="tf_idf", action="store_true", default=False,
                        help="Boolean - If set, TfIdf features will be used")
    parser.add_argument("--cat", type=str, dest="category", default="income-type",
                        choices=("income-type", "income-source", "expenditure-type"),
                        help="The type of categorization. Defaults to '%(default)s'")
    parser.add_argument("-c", "--classifiers", type=str, dest="classifiers", nargs="+",
                        default=["decision-tree", "logistic-regression"],
                        help="The classifiers to be used. More than one classifier can be used. The number of "
                             "classifiers should correspond to the number of grid-search parameter files.")
    parser.add_argument("-o", "--output-file", type=str, dest="output_file", default=OUTPUT_FILE,
                        help="A pickle file to output the predicted labels. Defaults to '%(default)s'")
    parser.add_argument("--od", "--output-file-desc", type=str, dest="output_file_desc", default=OUTPUT_FILE_DESC,
                        help="A csv file to output the predicted labels. Defaults to '%(default)s'")
    args = parser.parse_args()

    # Read the input dataframe
    data_in = pandas.read_pickle(args.input_file)
    type_classes = list(data_in["type_class"])
    source_classes = list(data_in["source_class"])
    frID = list(data_in["frID"])
    data_orig = data_in["description"]

    # pick the labels for the requested categorization,
    # converting them to strings if they are not strings already
    if args.category == "income-type" or args.category == "expenditure-type":
        labels_orig = [str(i) for i in type_classes]
    else:
        labels_orig = [str(i) for i in source_classes]

    # build the list of classifiers for the ensemble, using the grid-search
    # parameters if parameter files were given
    clfs = []
    for i in range(len(args.classifiers)):
        gridres_params = (pickle.load(open(args.gridres_file[i], "rb"))[0]
                          if args.gridres_file is not None else None)
        if args.classifiers[i] == "decision-tree":
            max_depth = gridres_params["clf__max_depth"] if gridres_params is not None else 100
            clfs += [tree.DecisionTreeClassifier(max_depth=max_depth, criterion="entropy")]
            print("Include a decision tree with max depth=%d" % max_depth)
        if args.classifiers[i] == "random-forest":
            max_depth = gridres_params["clf__max_depth"] if gridres_params is not None else 100
            clfs += [ensemble.RandomForestClassifier(max_depth=max_depth, criterion="entropy")]
            print("Include a random forest with max depth=%d" % max_depth)
        if args.classifiers[i] == "logistic-regression":
            C = gridres_params["clf__C"] if gridres_params is not None else 1
            clfs += [linear_model.LogisticRegression(C=C)]
            print("Include a logistic regressor with C=%g" % C)

    clf = ensemble_classifier.EnsembleClassifier(clfs=clfs, voting="hard")

    vectorizer = feature_extraction.text.CountVectorizer(
        analyzer="word",               # build word (not character) n-grams
        binary=False,                  # if True, all non-zero counts are set to one
        decode_error="strict",         # raise on byte sequences outside the given encoding
        encoding="ISO-8859-15",
        input="content",
        lowercase=False,               # do not lower-case before tokenizing (done upstream)
        max_df=1.0,                    # ignore terms with a document frequency above this threshold
        max_features=None,             # keep the full vocabulary
        ngram_range=(1, args.nGrams),  # extract all n-grams with 1 <= n <= nGrams
        preprocessor=None,
        stop_words=None,
        # min_df=1,
        strip_accents=None,
        token_pattern="(?u)\\b\\w\\w+\\b",
        tokenizer=None,
        vocabulary=None,
    )

    if args.tf_idf:
        transformer = feature_extraction.text.TfidfTransformer()
        ppl = pipeline.Pipeline([("vectorizer", vectorizer), ("transformer", transformer), ("clf", clf)])
    else:
        ppl = pipeline.Pipeline([("vectorizer", vectorizer), ("clf", clf)])

    # 5-fold stratified cross-validation
    k_fold = cross_validation.StratifiedKFold(labels_orig, 5, shuffle=True)
    labels_predicted = [-1] * len(labels_orig)
    accuracy = []
    for train_idx, dev_idx in k_fold:
        data_train = [data_orig[i] for i in train_idx]
        data_dev = [data_orig[i] for i in dev_idx]
        labels_train = [labels_orig[i] for i in train_idx]
        labels_dev = [labels_orig[i] for i in dev_idx]
        ppl.fit(data_train, labels_train)
        predicted_dev = ppl.predict(data_dev)
        labels_predicted = set_all_predicted(predicted_dev, labels_predicted, dev_idx)
        accuracy += [metrics.accuracy_score(labels_dev, predicted_dev)]

    print("Accuracy of the ensemble classifier: %.4f +- %.4f" % (numpy.mean(accuracy), numpy.std(accuracy)))

    # Save the predicted classes
    to_dump = [labels_orig, labels_predicted]
    helpers.ensure_dir(os.path.dirname(args.output_file))
    pickle.dump(to_dump, open(args.output_file, "wb"))

    # create a dataframe to output the true class, the predicted class and the description data
    if args.category == "income-type" or args.category == "expenditure-type":
        dump_op_desc = pandas.DataFrame({
            "frID": frID,
            "type_class": labels_orig,
            "type_class_predicted": labels_predicted,
            "description": data_orig,
        })
    else:
        dump_op_desc = pandas.DataFrame({
            "frID": frID,
            "source_class": labels_orig,
            "source_class_predicted": labels_predicted,
            "description": data_orig,
        })
    dump_op_desc.to_csv(args.output_file_desc)
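# `set_all_predicted` is a small helper shared by the prediction scripts but
# not defined in this excerpt. From how it is called (a full-length list of
# placeholder labels plus the fold's predictions and indices), a plausible
# sketch is the following; this is an assumption about its behaviour:
def set_all_predicted(predicted_dev, labels_predicted, dev_idx):
    """Write the fold's predictions back into the full-length prediction list
    at the positions given by dev_idx."""
    for pred, idx in zip(predicted_dev, dev_idx):
        labels_predicted[idx] = pred
    return labels_predicted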
def main():
    # parse the arguments that are passed to main
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    # these are the default input/output locations if nothing is passed
    INPUT_FILE = os.path.join("..", "..", "data", "features", "hierarchy-word-features.pkl")
    CLASSIFIER_DIR = os.path.join("..", "..", "data", "classifiers", "decision_trees")
    FOLD_FILE = os.path.join("..", "..", "data", "folds", "newfolds.csv")
    OUTPUT_DIR = os.path.join("..", "..", "data", "scores", "decision_trees")

    parser.add_argument("-i", "--input-file", type=str, dest="input_file", default=INPUT_FILE,
                        help="File with the list of item classes and features. Defaults to '%(default)s'")
    parser.add_argument("--fn", "--fold-number", dest="fold_number", default=None, type=int,
                        help="The fold number to be used for prediction. If 0, all the items will be considered. "
                             "If None, the classifier is evaluated for all the folds. Defaults to '%(default)s'")
    parser.add_argument("--ff", "--fold-file", type=str, dest="fold_file", default=FOLD_FILE,
                        help="Fold file containing the cross-fold validation indices. Defaults to '%(default)s'")
    parser.add_argument("--cd", "--classifier-dir", type=str, dest="classifier_dir", default=CLASSIFIER_DIR,
                        help="Directory where the trained decision trees are stored. Defaults to '%(default)s'")
    # parser.add_argument('--sm', '--sparse-matrix', dest='sparse_matrix', action='store_true', default=False,
    #                     help='If set, the data will be transformed into a sparse matrix')
    parser.add_argument("-o", "--output-dir", type=str, dest="output_dir", default=OUTPUT_DIR,
                        help="Directory to save the scores. Defaults to '%(default)s'")
    args = parser.parse_args()

    # Read the input dictionary
    type_classes, source_classes, token_container = pickle.load(open(args.input_file, "rb"))
    type_dict = {"IGI": 0, "IC": 1, "IV": 2, "IG": 3}

    # get all the label data
    labels_orig = [type_dict[x] for x in type_classes]

    # Create output directory
    helpers.ensure_dir(args.output_dir)

    # Read the fold indices
    cross_fold_indices = pandas.read_csv(args.fold_file, header=None)[0]

    if args.fold_number is None:
        # loop over all folds and evaluate the corresponding classifier
        print("Evaluation will be done iteratively for all folds...\n")
        pred_labels = {}
        for fold in cross_fold_indices.unique():
            print("Evaluating fold %d...\n" % fold)
            # read the vectorizer and the decision tree
            vectorizer = joblib.load(os.path.join(args.classifier_dir, "vectorizer-fold%d.pkl" % fold))
            dtree = joblib.load(os.path.join(args.classifier_dir, "tree-fold%d.pkl" % fold))
            # read and transform the data
            data_fold = [token_container[i] for i in range(len(cross_fold_indices))
                         if cross_fold_indices[i] == fold]
            data_matrix = vectorizer.transform(data_fold)
            pred_labels[fold] = dtree.predict(data_matrix)

    elif args.fold_number == 0:
        # use the classifier trained on all the data
        print("Evaluation will be done on the full set...\n")
        # read the vectorizer and the decision tree
        vectorizer = joblib.load(os.path.join(args.classifier_dir, "vectorizer.pkl"))
        dtree = joblib.load(os.path.join(args.classifier_dir, "tree.pkl"))
        # read the data
        data_matrix = vectorizer.transform(token_container)
        pred_labels = dtree.predict(data_matrix)

    else:
        # evaluate the classifier for a particular fold
        fold = args.fold_number
        print("Evaluation for fold %d...\n" % fold)
        # read the vectorizer and the decision tree
        vectorizer = joblib.load(os.path.join(args.classifier_dir, "vectorizer-fold%d.pkl" % fold))
        dtree = joblib.load(os.path.join(args.classifier_dir, "tree-fold%d.pkl" % fold))
        # read and transform the data
        data_fold = [token_container[i] for i in range(len(cross_fold_indices))
                     if cross_fold_indices[i] == fold]
        data_matrix = vectorizer.transform(data_fold)
        pred_labels = dtree.predict(data_matrix)

    # do the evaluation
    if args.fold_number is None:
        # we iterate over all the folds
        for fold in cross_fold_indices.unique():
            labels_fold = [labels_orig[i] for i in range(len(cross_fold_indices))
                           if cross_fold_indices[i] == fold]
            pred_labels_fold = pred_labels[fold]
            score = metrics.accuracy_score(labels_fold, pred_labels_fold)
            print("Accuracy on fold %d: %.5f" % (fold, score))
    elif args.fold_number == 0:
        pass
    else:
        fold = args.fold_number
        labels_fold = [labels_orig[i] for i in range(len(cross_fold_indices))
                       if cross_fold_indices[i] == fold]
        score = metrics.accuracy_score(labels_fold, pred_labels)
        print("Accuracy on fold %d: %.5f" % (fold, score))

    print("Done!\n")
def main():
    # parse the arguments that are passed to main
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    INPUT_FILE = os.path.join("..", "..", "data", "features", "bow_string_input_dframe.p")
    OUTPUT_FILE_DESC = os.path.join("..", "..", "data", "output", 'predicted_labels_ensemble.csv')

    parser.add_argument('-i', '--input-file', type=str, dest='input_file', default=INPUT_FILE,
                        help="File with the list of item classes and features. Defaults to '%(default)s'")
    parser.add_argument('-g', '--gridres-file', type=str, dest='gridres_file', default=None,
                        help="File with the best parameters of the grid search. Defaults to '%(default)s'")
    parser.add_argument('-n', '--ngrams', dest='nGrams', type=int, default=1,
                        help='Defines how to split words by ngrams. Default is tokenized to one word ngrams')
    parser.add_argument('--ti', dest='tf_idf', action='store_true', default=False,
                        help='Boolean - If set, TfIdf features will be used')
    parser.add_argument('-c', '--classifier', type=str, dest='classifier', default="ensemble",
                        help="The classifier to be used. Defaults to '%(default)s'")
    parser.add_argument('--cat', type=str, dest='category', default='income-type',
                        choices=('income-type', 'income-source', 'expenditure-type'),
                        help="The type of categorization. Defaults to '%(default)s'")
    parser.add_argument('--od', '--output-file-desc', type=str, dest='output_file_desc', default=OUTPUT_FILE_DESC,
                        help="A csv file to output the predicted labels. Defaults to '%(default)s'")
    args = parser.parse_args()

    # Read the input dataframe
    data_in = pandas.read_pickle(args.input_file)
    type_classes = list(data_in['type_class'])
    source_classes = list(data_in['source_class'])
    frID = list(data_in['frID'])
    data_orig = data_in['description']

    # pick the labels for the requested categorization,
    # converting them to strings if they are not strings already
    if args.category == 'income-type' or args.category == 'expenditure-type':
        labels_orig = [str(i) for i in type_classes]
    else:
        labels_orig = [str(i) for i in source_classes]

    # load the best grid-search parameters, if a file was given
    if args.gridres_file is not None:
        gridres_params = pickle.load(open(args.gridres_file, "rb"))[0]
    else:
        gridres_params = None

    if args.classifier == 'decision-tree':
        max_depth = gridres_params['clf__max_depth'] if gridres_params is not None else 100
        clf = tree.DecisionTreeClassifier(max_depth=max_depth, criterion='entropy')
        print("Will run a decision tree with max depth=%d" % max_depth)
    elif args.classifier == 'random-forest':
        max_depth = gridres_params['clf__max_depth'] if gridres_params is not None else 100
        clf = ensemble.RandomForestClassifier(max_depth=max_depth, criterion='entropy')
        print("Will run a random forest with max depth=%d" % max_depth)
    elif args.classifier == 'logistic-regression':
        C = gridres_params['clf__C'] if gridres_params is not None else 1
        clf = linear_model.LogisticRegression(C=C)
        print("Will run a logistic regressor with C=%g" % C)
    else:  # ensemble
        clf1 = tree.DecisionTreeClassifier(max_depth=100, criterion='entropy')
        clf2 = ensemble.RandomForestClassifier()
        clf3 = linear_model.LogisticRegression()
        clf = ensemble_classifier.EnsembleClassifier(clfs=[clf1, clf2, clf3], voting='hard')

    vectorizer = feature_extraction.text.CountVectorizer(
        analyzer='word',               # build word (not character) n-grams
        binary=False,                  # if True, all non-zero counts are set to one
        decode_error='strict',         # raise on byte sequences outside the given encoding
        encoding="ISO-8859-15",
        input='content',
        lowercase=False,               # do not lower-case before tokenizing (done upstream)
        max_df=1.0,                    # ignore terms with a document frequency above this threshold
        max_features=None,             # keep the full vocabulary
        ngram_range=(1, args.nGrams),  # extract all n-grams with 1 <= n <= nGrams
        preprocessor=None,
        stop_words=None,
        # min_df=1,
        strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None,
        vocabulary=None)

    if args.tf_idf:
        transformer = feature_extraction.text.TfidfTransformer()
        ppl = pipeline.Pipeline([
            ('vectorizer', vectorizer),
            ('transformer', transformer),
            ('clf', clf),
        ])
    else:
        ppl = pipeline.Pipeline([
            ('vectorizer', vectorizer),
            ('clf', clf),
        ])

    # 5-fold stratified cross-validation
    k_fold = cross_validation.StratifiedKFold(labels_orig, 5, shuffle=True)
    labels_predicted = [-1] * len(labels_orig)
    accuracy = []
    for train_idx, dev_idx in k_fold:
        data_train = [data_orig[i] for i in train_idx]
        data_dev = [data_orig[i] for i in dev_idx]
        labels_train = [labels_orig[i] for i in train_idx]
        labels_dev = [labels_orig[i] for i in dev_idx]
        ppl.fit(data_train, labels_train)
        predicted_dev = ppl.predict(data_dev)
        labels_predicted = set_all_predicted(predicted_dev, labels_predicted, dev_idx)
        accuracy += [metrics.accuracy_score(labels_dev, predicted_dev)]

    print("Accuracy of the %s classifier: %.4f +- %.4f" %
          (args.classifier, numpy.mean(accuracy), numpy.std(accuracy)))

    # Save the predicted classes
    to_dump = [labels_orig, labels_predicted]
    helpers.ensure_dir(os.path.dirname(args.output_file_desc))

    # create a dataframe to output the true class, the predicted class and the description data
    if args.category == 'income-type' or args.category == 'expenditure-type':
        dump_op_desc = pandas.DataFrame({'frID': frID,
                                         'type_class': labels_orig,
                                         'type_class_predicted': labels_predicted,
                                         'description': data_orig})
    else:
        dump_op_desc = pandas.DataFrame({'frID': frID,
                                         'source_class': labels_orig,
                                         'source_class_predicted': labels_predicted,
                                         'description': data_orig})
    dump_op_desc.to_csv(args.output_file_desc)
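# `ensemble_classifier.EnsembleClassifier` is a project-local hard-voting
# wrapper around several sklearn estimators. A minimal sketch of such a class
# (an assumption about its behaviour; sklearn's own VotingClassifier is the
# standard alternative):
from collections import Counter

import numpy
from sklearn.base import BaseEstimator, ClassifierMixin


class EnsembleClassifier(BaseEstimator, ClassifierMixin):
    """Fit several classifiers and predict by majority (hard) vote."""

    def __init__(self, clfs, voting="hard"):
        self.clfs = clfs
        self.voting = voting  # only hard voting is sketched here

    def fit(self, X, y):
        for clf in self.clfs:
            clf.fit(X, y)
        return self

    def predict(self, X):
        # one row of predictions per classifier; take the most common label per sample
        predictions = [clf.predict(X) for clf in self.clfs]
        voted = [Counter(sample_votes).most_common(1)[0][0]
                 for sample_votes in zip(*predictions)]
        return numpy.asarray(voted)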
def main():
    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    # this is the default input directory if nothing is passed
    # this default file still contains I and IO values so needs updating
    INPUT_FILE = os.path.join("..", "..", "data", "features", "hierarchical", "new", "dframe_iTrain.csv")
    OUTPUT_DIR = os.path.join("..", "..", "data", "features")
    OUTPUT_FILENAME = 'temp/data_stri_Please_rename_or_delete_me.p'

    parser.add_argument('--ad', '--addData', dest='addData', action='store_false', default=True,
                        help='Adds charity type and name to the description list (pass this flag to disable)')
    parser.add_argument('-i', '-input-file', type=str, dest='inputFile', default=INPUT_FILE,
                        help="Dataframe to be processed to a bag of words. Defaults to '%(default)s'")
    parser.add_argument('--lc', '--lastCells', dest='lastCells', type=int, default=0,
                        help='If set, will extract the n last cells from the dataframe according to the input '
                             'number. For use with source_class it needs a hierarchically tokenized dataframe')
    parser.add_argument('--hi', '--hierarchy', dest='hierarchy', action='store_true', default=False,
                        help='If set, spaces will be removed from within hierarchical items')
    parser.add_argument('--wl', '--wordsForLast', dest='wordsForLast', action='store_true', default=False,
                        help='Do not concatenate the last hierarchical item so that individual words are kept')
    parser.add_argument('--od', '-output-dir', type=str, dest='output_dir', default=OUTPUT_DIR,
                        help="Directory to save the output")
    parser.add_argument('-o', '-output-file', type=str, dest='outputFilename', default=OUTPUT_FILENAME,
                        help="Filename of the output file - must be a pickle .p format")
    args = parser.parse_args()

    data_in = pandas.read_csv(args.inputFile, encoding="ISO-8859-15")
    data_in_str = pandas.read_csv(args.inputFile, encoding="ISO-8859-15", dtype=str)
    # data_in = data_in[data_in.type_class != 'I']
    # data_in = data_in[data_in.type_class != 'IO']

    data_stri = data_in_str[['frID', 'type_class', 'source_class']]
    data_stri['description'] = 'Nan'
    sep = " "

    if args.hierarchy:
        print('in hierarchy separation mode')
        token_data = data_in.iloc[0:, 5:]
        for i in range(0, len(token_data)):
            row = token_data.iloc[i, 0:]
            for j in range(0, len(row)):
                token = row.iloc[j]
                if args.wordsForLast:
                    # keep the individual words of the last hierarchical item;
                    # collapse the spaces of every other item
                    if j < (len(row) - 1):
                        nextToken = row.iloc[j + 1]
                        if isinstance(nextToken, float):
                            if not math.isnan(nextToken):
                                if type(token) == str:
                                    token = token.replace(" ", "")
                        else:
                            if type(token) == str:
                                token = token.replace(" ", "")
                else:
                    if type(token) == str:
                        token = token.replace(" ", "")
                row[j] = token
            token_data.iloc[i] = row
        data_in.iloc[0:, 5:] = token_data  # .iloc[0:,0:]

    if args.addData:
        charity_type = data_in_str.iloc[0:, 3]
        charity_name = data_in_str.iloc[0:, 4]

    if args.lastCells == 0:
        # effectively processing type_class
        print('last cells equals 0')
        token_data = data_in.iloc[0:, 5:]
        i = 0
        for row in range(0, len(token_data)):
            token_data_stri = sep.join(map(str, token_data.iloc[row, 0:].dropna()))
            if args.addData:
                collapsedname = charity_name.iloc[row].replace(" ", "")
                collapsedtype = charity_type.iloc[row].replace(" ", "")
                all_description = sep.join([token_data_stri, collapsedname, collapsedtype])
                data_stri.iloc[row, 3] = all_description
                i = i + 1
                if i % 100 == 0:
                    print('row %d' % i)
            else:
                data_stri.iloc[row, 3] = token_data_stri
    else:
        # effectively processing for source class as this selects the last n segments
        # of the hierarchy - needs to be passed the hierarchical dataframe
        # print('last cells equals ', lastCells)
        for row in range(0, len(data_in)):
            print(row)
            all_cells = list()
            for col in data_in.iloc[int(row), 5:]:
                # print("col: ", col)
                all_cells.append(col)
            all_cells = [x for x in all_cells if str(x) != 'nan']
            data_stri.iloc[int(row), 3] = sep.join(all_cells[len(all_cells) - args.lastCells:len(all_cells)])

    helpers.ensure_dir(args.output_dir)
    data_stri.to_pickle(os.path.join(args.output_dir, args.outputFilename))
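# Worked example (made-up data) of the transformation performed above when --hi/--hierarchy is set:
# spaces inside each hierarchical item are collapsed so every level becomes a single token, and the
# levels are then joined with the separator into one description string.
row_example = ["Voluntary income", "Grants receivable", "Lottery funding"]
collapsed = [item.replace(" ", "") for item in row_example]   # ['Voluntaryincome', 'Grantsreceivable', 'Lotteryfunding']
description = " ".join(collapsed)                             # 'Voluntaryincome Grantsreceivable Lotteryfunding'
print(description)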
def main():
    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    # this is the default input directory if nothing is passed
    INPUT_FILE = os.path.join("..", "..", "data", "features", "hierarchy-word-features.pkl")
    CLASSIFIER_DIR = os.path.join("..", "..", "data", "classifiers", "decision_trees")
    FOLD_FILE = os.path.join("..", "..", "data", "folds", "newfolds.csv")
    OUTPUT_DIR = os.path.join("..", "..", "data", "scores", "decision_trees")

    parser.add_argument('-i', '--input-file', type=str, dest='input_file', default=INPUT_FILE,
                        help="File with the list of item classes and features. Defaults to '%(default)s'")
    parser.add_argument('--fn', '--fold-number', dest='fold_number', default=None, type=int,
                        help="The fold number to be used for prediction. If 0, all the items will be considered. "
                             "If None, will evaluate the classifier for all the folds. Defaults to '%(default)s'")
    parser.add_argument('--ff', '--fold-file', type=str, dest='fold_file', default=FOLD_FILE,
                        help="Fold file containing the cross-fold validation indices. Defaults to '%(default)s'")
    parser.add_argument('--cd', '--classifier-dir', type=str, dest='classifier_dir', default=CLASSIFIER_DIR,
                        help="Directory where the trained decision trees are stored. Defaults to '%(default)s'")
    # parser.add_argument('--sm', '--sparse-matrix', dest='sparse_matrix', action='store_true', default=False,
    #                     help='If set, the data will be transformed into a sparse matrix')
    parser.add_argument('-o', '--output-dir', type=str, dest='output_dir', default=OUTPUT_DIR,
                        help="Directory to save the scores. Defaults to '%(default)s'")
    args = parser.parse_args()

    # Read the input dictionary
    type_classes, source_classes, token_container = pickle.load(open(args.input_file, "rb"))
    type_dict = {'IGI': 0, 'IC': 1, 'IV': 2, 'IG': 3}

    # get all the label data
    labels_orig = [type_dict[x] for x in type_classes]

    # Create output directory
    helpers.ensure_dir(args.output_dir)

    # Read the fold indices
    cross_fold_indices = pandas.read_csv(args.fold_file, header=None)[0]

    if args.fold_number is None:
        # loop over all folds and evaluate the classifier of each fold
        print("Evaluation will be done iteratively for all folds...\n")
        pred_labels = {}
        for fold in cross_fold_indices.unique():
            print("Evaluating fold %d...\n" % fold)
            # read the vectorizer and the decision tree
            vectorizer = joblib.load(os.path.join(args.classifier_dir, 'vectorizer-fold%d.pkl' % (fold)))
            dtree = joblib.load(os.path.join(args.classifier_dir, 'tree-fold%d.pkl' % (fold)))
            # read and transform the data
            data_fold = [token_container[i] for i in range(len(cross_fold_indices))
                         if cross_fold_indices[i] == fold]
            data_matrix = vectorizer.transform(data_fold)
            pred_labels[fold] = dtree.predict(data_matrix)
    elif args.fold_number == 0:
        # use all the data for the evaluation
        print("Evaluation will be done on the full set...\n")
        # read the vectorizer and decision tree
        vectorizer = joblib.load(os.path.join(args.classifier_dir, 'vectorizer.pkl'))
        dtree = joblib.load(os.path.join(args.classifier_dir, 'tree.pkl'))
        # read the data
        data_matrix = vectorizer.transform(token_container)
        pred_labels = dtree.predict(data_matrix)
    else:
        # evaluate the classifier of a particular fold
        # import ipdb; ipdb.set_trace()  # debug breakpoint, disabled
        fold = args.fold_number
        print("Evaluation for fold %d...\n" % fold)
        # read the vectorizer and the decision tree
        vectorizer = joblib.load(os.path.join(args.classifier_dir, 'vectorizer-fold%d.pkl' % (fold)))
        dtree = joblib.load(os.path.join(args.classifier_dir, 'tree-fold%d.pkl' % (fold)))
        # read and transform the data
        data_fold = [token_container[i] for i in range(len(cross_fold_indices))
                     if cross_fold_indices[i] == fold]
        data_matrix = vectorizer.transform(data_fold)
        pred_labels = dtree.predict(data_matrix)

    # do the evaluation
    if args.fold_number is None:
        # we iterate over all the folds
        for fold in cross_fold_indices.unique():
            labels_fold = [labels_orig[i] for i in range(len(cross_fold_indices))
                           if cross_fold_indices[i] == fold]
            pred_labels_fold = pred_labels[fold]
            score = metrics.accuracy_score(labels_fold, pred_labels_fold)
            print("Accuracy on fold %d: %.5f" % (fold, score))
    elif args.fold_number == 0:
        pass
    else:
        fold = args.fold_number
        cross_fold_indices = pandas.read_csv(args.fold_file, header=None)[0]
        labels_fold = [labels_orig[i] for i in range(len(cross_fold_indices))
                       if cross_fold_indices[i] == fold]
        score = metrics.accuracy_score(labels_fold, pred_labels)
        print("Accuracy on fold %d: %.5f" % (fold, score))

    print("Done!\n")
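# The fold file read above with pandas.read_csv(args.fold_file, header=None)[0] is assumed to hold one
# integer fold id per row, one row per sample, as produced by the fold-generation script in this
# codebase. A minimal self-contained sketch of the per-fold selection pattern (made-up fold ids):
import pandas

cross_fold_indices_example = pandas.Series([1, 2, 3, 1, 2, 3])   # hypothetical fold assignment for six samples
fold = 2
fold_items = [i for i in range(len(cross_fold_indices_example)) if cross_fold_indices_example[i] == fold]
print(fold_items)                                                # [1, 4] - positions of the samples in fold 2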
def main():
    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    # this is the default input directory if nothing is passed
    INPUT_HT = os.path.join("..", "..", "data", "features", "data_frame_hierarchy.csv")
    INPUT_WT = os.path.join("..", "..", "data", "features", "data_frame_words.csv")
    OUTPUT_DIR = os.path.join("..", "..", "data", "features")

    parser.add_argument('--ht', '--hierarchical-tokens-file', type=str, dest='input_ht', default=INPUT_HT,
                        help="A dataframe file containing the hierarchical level tokens for each item in the "
                             "dataset. Defaults to '%(default)s'")
    parser.add_argument('--wt', '--word-tokens-file', type=str, dest='input_wt', default=INPUT_WT,
                        help="A dataframe file containing the word tokens for each item in the dataset. "
                             "Defaults to '%(default)s'")
    parser.add_argument('--nh', '--num-hierarchy-tokens', dest='num_hierarchy_tokens', default=20, type=int,
                        help="The number of the most frequent hierarchical tokens to use per class. "
                             "Defaults to '%(default)s'")
    parser.add_argument('--nw', '--num-word-tokens', dest='num_word_tokens', default=20, type=int,
                        help="The number of the most frequent word tokens to use per class. Defaults to '%(default)s'")
    parser.add_argument('-o', '--output-dir', type=str, dest='output_dir', default=OUTPUT_DIR,
                        help="Directory to save the selected tokens. Defaults to '%(default)s'")
    args = parser.parse_args()

    data_hierarchy = pandas.read_csv(args.input_ht, encoding="ISO-8859-1")
    data_words = pandas.read_csv(args.input_wt, encoding="ISO-8859-1")

    # Remove samples from classes which are noisy and not useful ("I", "IO")
    data_hierarchy = data_hierarchy[data_hierarchy.type_class != 'I']
    data_hierarchy = data_hierarchy[data_hierarchy.type_class != 'IO']
    data_words = data_words[data_words.type_class != 'I']
    data_words = data_words[data_words.type_class != 'IO']

    # Read the available classes and labels from the data
    labels_hierarchy = data_hierarchy.type_class.unique()
    labels_words = data_words.type_class.unique()
    columns_hierarchy = data_hierarchy.columns
    columns_words = data_words.columns  # fixed: the word columns come from the word dataframe

    hierarchy_tokens = []
    # Finding the N most frequent hierarchy tokens
    print("Processing hierarchical tokens...")
    for label in labels_hierarchy:
        print("Processing %s class..." % label)
        data_by_label = data_hierarchy[data_hierarchy.type_class == label]  # filter only the items with this label
        descr = data_by_label[columns_hierarchy[2:]]  # take only the columns that represent description
        hierarchy_tokens += find_most_common_tokens(descr, args.num_hierarchy_tokens)

    word_tokens = []
    # Finding the N most frequent word tokens
    print("Processing word tokens...")
    for label in labels_words:
        print("Processing %s class..." % label)
        data_by_label = data_words[data_words.type_class == label]  # filter only the items with this label
        descr = data_by_label[columns_words[2:]]  # take only the columns that represent description
        word_tokens += find_most_common_tokens(descr, args.num_word_tokens)

    # Save the tokens
    # import ipdb; ipdb.set_trace()  # debug breakpoint, disabled
    helpers.ensure_dir(args.output_dir)
    hierarchy_tokens_towrite = pandas.Series(pandas.Series(numpy.array(hierarchy_tokens)).unique())  # take each token just once
    hierarchy_tokens_towrite.to_csv(os.path.join(args.output_dir, 'hierarchy_tokens.csv'),
                                    index=False)  # write to file, but don't give row names
    word_tokens_towrite = pandas.Series(pandas.Series(numpy.array(word_tokens)).unique())
    word_tokens_towrite.to_csv(os.path.join(args.output_dir, 'word_tokens.csv'),
                               index=False)  # write to file, but don't give row names
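# NOTE: find_most_common_tokens() is defined elsewhere in the project. The sketch below is only an
# assumed reconstruction of its behaviour from the call sites above (a dataframe of token columns in,
# the N most frequent tokens out); it is not the original implementation.
import collections

def find_most_common_tokens(descr, num_tokens):
    """Return the num_tokens most frequent non-null tokens found in a dataframe of token columns."""
    counter = collections.Counter()
    for column in descr.columns:
        counter.update(descr[column].dropna())
    return [token for token, count in counter.most_common(num_tokens)]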
def main():
    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    # this is the default input directory if nothing is passed
    INPUT_FILE = os.path.join("..", "..", "data", "iTrain_extra_lastitem_mod_new.csv")  # iTrain.csv
    FOLD_FILE = os.path.join("..", "..", "data", "folds", "iTrain_fold.csv")  # iTrain.csv
    OUTPUT_FILE = os.path.join("..", "..", "data", "features", "new", "dframe_new_iTrain.csv")
    BAD_OUTPUT_FILE = os.path.join("..", "..", "data", "features", "welsh_iTrain.csv")
    SPELL_CHECKER_PATH = os.path.join("..", "..", "data", "big.txt")

    parser.add_argument('-i', '-input-file', type=str, dest='inputFile', default=INPUT_FILE,
                        help='File to be processed to a master set. File must be saved in data and the argument '
                             'structured as - ../../data/yourfilename.csv')
    parser.add_argument('---lan', '---language', dest='language', action='store_true', default=False,
                        help='Boolean - If set, language will be determined and non-English items will be removed')
    parser.add_argument('-l', '-lemmatize', dest='lemmatize', action='store_true', default=False,
                        help='Boolean - If set, verbs will be lemmatized')
    parser.add_argument('--la', '--lemmatizeall', dest='lemmatizeall', action='store_true', default=False,
                        help='Boolean - If set, all words will be lemmatized')
    parser.add_argument('--lc', '--lower-case', dest='lowerCase', action='store_false', default=True,
                        help='Boolean - Defaults to converting all words to lower-case')
    parser.add_argument('--rw', '--remove-words', dest='removeTags', nargs='+',
                        # action='store_true',
                        default=None,
                        help='Accepts a list of types of words to be removed from the list of ADJ, ADV, CNJ, DET, '
                             'EX, FW, MOD, N, NP, NUM, PRO, P, TO, UH, V, VD, VG, VN, WH')
    parser.add_argument('-s', '-stematize', dest='stematize', action='store_true', default=False,
                        help='Boolean - If set, all words will be stematized')
    parser.add_argument('--sa', '--strip-accents', dest='stripAccents', action='store_false', default=True,
                        help="Removes accents on letters, replacing them with just the letter itself")
    parser.add_argument('--sp', '--spelling-corrector', dest='spellCorrect', action='store_false', default=True,
                        help="Correct spelling mistakes word by word, just taking the most likely correction")
    parser.add_argument('--sd', '--spell-dictionary', dest='spell_dictionary', default=SPELL_CHECKER_PATH,
                        help="File containing the dictionary to be used for spell-check. Defaults to '%(default)s'")
    parser.add_argument('--sw', '--stop-words', dest='stopWords', action='store_false', default=True,
                        help='Removes the most common words, "stop words", from the text')
    # parser.add_argument('-t', '-tokenize', dest='tokenize', action='store_true', default=False,
    #                     help='Tokenizes text to individual words')
    parser.add_argument('--ta', '--alpha-numeric', dest='alphaNumeric', action='store_true', default=True,
                        help="Boolean - If NOT set, the file will be tokenized with non alpha-numeric words left in. "
                             "Defaults to '%(default)s'")
    parser.add_argument('--th', '--token-hyphen', dest='tokenHyphen', action='store_true', default=False,
                        help='Tokenizes text using the directory structure from the input file')
    parser.add_argument('--uc', '--upper-case', dest='upperCase', action='store_true', default=False,
                        help='Boolean - If set, all words will be converted to upper-case')
    parser.add_argument('--fn', '--fold-number', dest='fold_number', default=0, type=int,
                        help="The fold number to be EXCLUDED when creating the master set (If there are N folds, "
                             "N-1 folds are used for creating the master set). If 0, all the items will be "
                             "considered. Defaults to '%(default)s'")
    parser.add_argument('--ff', '--fold-file', type=str, dest='foldFile', default=FOLD_FILE,
                        help="Fold file containing the cross-fold validation indices. Defaults to '%(default)s'")
    parser.add_argument('-o', '-output-file', type=str, dest='output_file', default=OUTPUT_FILE,
                        help="File to be used to save the processed dataframe. Defaults to '%(default)s'")
    parser.add_argument('--bo', '--bad-output-file', type=str, dest='bad_output_file', default=BAD_OUTPUT_FILE,
                        help="File to be used to save the non-English (Welsh) items. Defaults to '%(default)s'")
    args = parser.parse_args()

    # data_in = pandas.read_csv(args.inputFile, encoding="ISO-8859-1")
    data_in = pandas.read_csv(args.inputFile, encoding="ISO-8859-15", dtype=str)

    # set column names?
    words = pandas.DataFrame({'frID': data_in.frID,
                              'type': data_in.type_class,
                              'class': data_in.source_class,
                              'description': data_in.description,
                              'ICNPO_category': data_in.ICNPO_category,
                              'nicename': data_in.nicename})
    processed_data = pandas.DataFrame(data_in[['frID', 'type_class', 'source_class', 'ICNPO_category', 'nicename']])

    # Define word_list
    word_list = words.description
    # print(words.head())

    if args.language:
        # check that the text is in English and separate out the rest
        print('Items that are more likely to be Welsh:')
        langval = text_processing.language(word_list)
        # for x in range(0, len(word_list)):
        good = numpy.where([x >= 0.01 for x in langval])
        bad = numpy.where([x < 0.01 for x in langval])
        badwords = words.drop(words.index[good])
        words = words.drop(words.index[bad])
        word_list = words.description
        helpers.ensure_dir(os.path.dirname(args.bad_output_file))
        badwords.to_csv(args.bad_output_file, index=False)  # write to file, but don't give row names
        print('Only items with en < 0.01 are taken to be bad')

    # tokenize the text either straight or keeping only alpha-numeric (default)
    if args.alphaNumeric:
        # keep just the alpha-numeric characters
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'\w+')
        word_list = list(map(tokenizer.tokenize, word_list))
        # print("alpha numeric only")
        # print(word_list[1:20])
    # if args.tokenize:
    #     word_list = list(map(tokenizer.tokenize, word_list))
    #     word_list = list(map(nltk.word_tokenize, words.description))
    #     print(word_list[1:20])

    if args.tokenHyphen:
        # print("Tokenize on hyphen")
        word_list = list(map(text_processing.tokenize_on_hyphen, word_list))
        # print(word_list[1:20])

    # lower case
    if args.lowerCase:
        # print("LOWER CASE")
        word_list = list(map(text_processing.make_lower, word_list))
        # print(word_list[1:20])

    # Upper case
    if args.upperCase:
        # print("UPPER CASE")
        word_list = list(map(text_processing.make_upper, word_list))
        # print(word_list[1:20])

    if args.stripAccents:
        # print("STRIP ACCENTS")
        i = 0
        for x in range(len(word_list)):
            correctedWords = [text_processing.strip_accents(y) for y in word_list[x]]
            word_list[x] = correctedWords
            i = i + 1
            # if i % 100 == 0:
            #     print('row %d' % i)
        # word_list = list(map(text_processing.strip_accents, word_list))
        # print(word_list[1:20])

    if args.spellCorrect:
        # print("CORRECT SPELLING")
        word_list = spell_checker.correctall(word_list, args.spell_dictionary)
        # print(word_list[1:20])

    if args.removeTags:
        # print("REMOVE WORDS")  # needs function in text_processing
        # note: map() pairs each row of word_list with one element of args.removeTags
        word_list = list(map(text_processing.keep_only_specified_tags, word_list, args.removeTags))

    if args.stopWords:
        # print("STOP WORDS")
        word_list = list(map(text_processing.exclude_stop_words, word_list))
        # print(word_list[1:20])

    if args.lemmatizeall:
        # print("LEMMATIZE ALL")
        word_list = list(map(text_processing.lemmatizeall, word_list))
        # print(word_list[1:20])

    if args.lemmatize:
        # print("LEMMATIZE")
        word_list = list(map(text_processing.lemmatize, word_list))
        print(word_list[1:20])

    if args.stematize:
        # print("STEMATIZE")
        word_list = list(map(text_processing.stematize, word_list))
        # print(word_list[1:20])

    wl_df = pandas.DataFrame(word_list)
    frames = [processed_data, wl_df]
    output_df = pandas.concat(frames, axis=1)
    # print(output_df[1:20])

    helpers.ensure_dir(os.path.dirname(args.output_file))
    output_df.to_csv(args.output_file, index=False)  # write to file, but don't give row names
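# Small self-contained illustration (hypothetical scores) of the numpy.where / drop pattern used by the
# language filter above: rows whose English-likelihood score falls below the threshold are split off
# into a separate frame, and only the remaining rows continue through the pipeline.
import numpy
import pandas

words_example = pandas.DataFrame({'description': ['cash grants', 'incwm arall', 'shop income']})
langval_example = [0.95, 0.002, 0.87]                               # per-row scores (made up)
good = numpy.where([x >= 0.01 for x in langval_example])
bad = numpy.where([x < 0.01 for x in langval_example])
badwords_example = words_example.drop(words_example.index[good])    # rows judged non-English
words_example = words_example.drop(words_example.index[bad])        # rows kept for further processing
print(len(words_example), len(badwords_example))                    # 2 1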
def main():
    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    INPUT_FILE = os.path.join("..", "..", "data", "features", "hierarchy-word-features.pkl")
    OUTPUT_FILE = os.path.join("..", "..", "data", "output", 'predicted_labels_ensemble.pkl')

    parser.add_argument('-i', '--input-file', type=str, dest='input_file', default=INPUT_FILE,
                        help="File with the list of item classes and features. Defaults to '%(default)s'")
    parser.add_argument('-c', '--classifier', type=str, dest='classifier', default="ensemble",
                        help="The classifier to be used. Defaults to '%(default)s'")
    parser.add_argument('--orig-labels', dest='orig_labels', action='store_true', default=False,
                        help='Boolean - If set, the original data labels will be stored. Otherwise, they will be '
                             'coded as integers')
    parser.add_argument('--cat', type=str, dest='category', default='income-type',
                        choices=('income-type', 'income-source', 'expenditure-type'),
                        help="The type of categorization. Defaults to '%(default)s'")
    parser.add_argument('-o', '-output-file', type=str, dest='output_file', default=OUTPUT_FILE,
                        help="A file to output the predicted labels. Defaults to '%(default)s'")
    args = parser.parse_args()

    # Read the input dictionary
    type_classes, source_classes, token_container = pickle.load(open(args.input_file, "rb"))
    # import ipdb; ipdb.set_trace()  # debug breakpoint, disabled

    # get all the label data
    if args.category == 'income-type' or args.category == 'expenditure-type':
        labels_orig = [str(i) for i in type_classes]  # converting them to strings if they are not strings already
    else:
        labels_orig = [str(i) for i in source_classes]  # converting them to strings if they are not strings already
    # labels_orig = [type_dict[x] for x in type_classes]
    data_orig = token_container

    if args.classifier == 'decision-tree':
        clf = tree.DecisionTreeClassifier(max_depth=100, criterion='entropy')
    elif args.classifier == 'random-forest':
        clf = ensemble.RandomForestClassifier()
    elif args.classifier == 'logistic-regression':
        clf = linear_model.LogisticRegression()
    else:  # ensemble
        clf1 = tree.DecisionTreeClassifier(max_depth=100, criterion='entropy')
        clf2 = ensemble.RandomForestClassifier()
        clf3 = linear_model.LogisticRegression()
        clf = ensemble_classifier.EnsembleClassifier(clfs=[clf1, clf2, clf3], voting='hard')

    ppl = pipeline.Pipeline([
        ('vectorizer', feature_extraction.DictVectorizer(sparse=True)),  # sparse=True
        ('clf', clf),
    ])

    k_fold = cross_validation.StratifiedKFold(labels_orig, 5, shuffle=True)
    # labels_predicted = numpy.array([-1] * len(labels_orig), dtype='int')
    labels_predicted = [-1] * len(labels_orig)
    accuracy = []
    for train_idx, dev_idx in k_fold:
        data_train = [data_orig[i] for i in train_idx]
        data_dev = [data_orig[i] for i in dev_idx]
        labels_train = [labels_orig[i] for i in train_idx]
        labels_dev = [labels_orig[i] for i in dev_idx]
        ppl.fit(data_train, labels_train)
        predicted_dev = ppl.predict(data_dev)
        # labels_predicted[dev_idx] = predicted_dev
        labels_predicted = set_all_predicted(predicted_dev, labels_predicted, dev_idx)
        accuracy += [metrics.accuracy_score(labels_dev, predicted_dev)]

    print("Accuracy of the %s classifier: %.4f +- %.4f" %
          (args.classifier, numpy.mean(accuracy), numpy.std(accuracy)))

    # Save the predicted classes
    # inv_type_dict = {v: k for k, v in type_dict.items()}
    to_dump = [labels_orig, labels_predicted]
    helpers.ensure_dir(os.path.dirname(args.output_file))
    pickle.dump(to_dump, open(args.output_file, "wb"))

    '''
    predicted_type_classes = [inv_type_dict[x] for x in labels_predicted]
    if args.orig_labels:
        # save the original labels
        predicted_type_classes = [inv_type_dict[x] for x in labels_predicted]
        to_dump = [type_classes, predicted_type_classes]
    else:
        # save the labels codified with integer numbers, as well as the decoding dictionary
        to_dump = [labels_orig, labels_predicted, inv_type_dict]
    '''
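# The pipeline above passes token_container straight into DictVectorizer, so each element is expected to
# be a dict mapping feature names (tokens) to values/counts. Minimal sketch with made-up features:
from sklearn import feature_extraction

token_container_example = [
    {'grants': 2, 'income': 1},
    {'shop': 1, 'income': 1},
]
vec = feature_extraction.DictVectorizer(sparse=True)
X = vec.fit_transform(token_container_example)
print(X.shape)        # (2, 3) - two items, three distinct features
print(X.toarray())    # dense view of the count matrix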
def main():
    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    # this is the default input directory if nothing is passed
    INPUT_FILE = os.path.join("..", "..", "data", "test.csv")  # iTrain.csv
    FOLD_FILE = os.path.join("..", "..", "data", "folds", "iTrain_fold.csv")  # iTrain.csv
    OUTPUT_DIR = os.path.join("..", "..", "data", "test")
    SPELL_CHECKER_PATH = os.path.join("..", "..", "data", "big.txt")

    parser.add_argument('-i', '-input-file', type=str, dest='inputFile', default=INPUT_FILE,
                        help='File to be processed to a master set. Defaults to iTrain.csv. File must be saved in '
                             'data and the argument structured as - ../../data/yourfilename.csv')
    parser.add_argument('---lan', '---language', dest='language', action='store_true', default=False,
                        help='Boolean - If set, language will be determined and non-English items will be removed')
    parser.add_argument('-l', '-lemmatize', dest='lemmatize', action='store_true', default=False,
                        help='Boolean - If set, verbs will be lemmatized')
    parser.add_argument('--la', '--lemmatizeall', dest='lemmatizeall', action='store_true', default=False,
                        help='Boolean - If set, all words will be lemmatized')
    parser.add_argument('--lc', '--lower-case', dest='lowerCase', action='store_false', default=True,
                        help='Boolean - Defaults to converting all words to lower-case')
    parser.add_argument('--rw', '--remove-words', dest='removeTags', nargs='+',
                        # action='store_true',
                        default=None,
                        help='Accepts a list of types of words to be removed from the list of ADJ, ADV, CNJ, DET, '
                             'EX, FW, MOD, N, NP, NUM, PRO, P, TO, UH, V, VD, VG, VN, WH')
    parser.add_argument('-s', '-stematize', dest='stematize', action='store_true', default=False,
                        help='Boolean - If set, all words will be stematized')
    parser.add_argument('--sa', '--strip-accents', dest='stripAccents', action='store_false', default=True,
                        help="Removes accents on letters, replacing them with just the letter itself")
    parser.add_argument('--sp', '--spelling-corrector', dest='spellCorrect', action='store_false', default=True,
                        help="Correct spelling mistakes word by word, just taking the most likely correction")
    parser.add_argument('--sw', '--stop-words', dest='stopWords', action='store_false', default=True,
                        help='Removes the most common words, "stop words", from the text')
    # parser.add_argument('-t', '-tokenize', dest='tokenize', action='store_true', default=False,
    #                     help='Tokenizes text to individual words')
    parser.add_argument('--ta', '--alpha-numeric', dest='alphaNumeric', action='store_true', default=False,
                        help="Boolean - If NOT set, the file will be tokenized with non alpha-numeric words left in. "
                             "Defaults to '%(default)s'")
    parser.add_argument('--th', '--token-hyphen', dest='tokenHyphen', action='store_true', default=False,
                        help='Tokenizes text using the directory structure from the input file')
    parser.add_argument('--uc', '--upper-case', dest='upperCase', action='store_true', default=False,
                        help='Boolean - If set, all words will be converted to upper-case')
    parser.add_argument('--fn', '--fold-number', dest='fold_number', default=0, type=int,
                        help="The fold number to be EXCLUDED when creating the master set (If there are N folds, "
                             "N-1 folds are used for creating the master set). If 0, all the items will be "
                             "considered. Defaults to '%(default)s'")
    parser.add_argument('--ff', '--fold-file', type=str, dest='foldFile', default=FOLD_FILE,
                        help="Fold file containing the cross-fold validation indices. Defaults to '%(default)s'")
    parser.add_argument('-o', '-output-dir', type=str, dest='output_dir', default=OUTPUT_DIR,
                        help="Directory to be used to save the created master set. The filename will be "
                             "automatically created based on the input flags. Defaults to '%(default)s'")
    args = parser.parse_args()

    # data_in = pandas.read_csv(args.inputFile, encoding="ISO-8859-1")
    data_in = pandas.read_csv(args.inputFile, encoding="ISO-8859-15")
    words = pandas.DataFrame(data_in.description)  # take just the description data

    # Filter the folds for master set training
    if args.fold_number != 0:
        # for creating the master set, keep just the words that DO NOT belong to the fold given as an argument
        cross_fold_indices = pandas.read_csv(args.foldFile, header=None)[0]
        words = words.loc[cross_fold_indices != args.fold_number]

    # Define word_list
    word_list = words.description
    # print(words.head())

    if args.language:
        # check that the text is in English and separate out the rest
        langval = text_processing.language(word_list)
        # for x in range(0, len(word_list)):
        good = numpy.where([x >= 0.8 for x in langval])
        bad = numpy.where([x < 0.8 for x in langval])
        # good = numpy.where([x == 'en' for x in langval])
        # bad = numpy.where([x != 'en' for x in langval])
        badwords = words.drop(words.index[good])
        words = words.drop(words.index[bad])
        word_list = words.description
        bad_output_fileName = "welsh_set.csv"
        bad_output_file = os.path.join(args.output_dir, bad_output_fileName)
        helpers.ensure_dir(os.path.dirname(args.output_dir))
        badwords.to_csv(bad_output_file, index=False)  # write to file, but don't give row names

    # tokenize the text either straight or keeping only alpha-numeric (default)
    if args.alphaNumeric:
        # keep just the alpha-numeric characters
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'\w+')
        word_list = list(map(tokenizer.tokenize, word_list))
        # print("alpha numeric only")
        # print(word_list[1:20])
    # if args.tokenize:  # disabled: the '-t' option above is commented out, so args has no 'tokenize' attribute
    #     word_list = list(map(tokenizer.tokenize, word_list))
    #     word_list = list(map(nltk.word_tokenize, words.description))
    #     print(word_list[1:20])

    if args.tokenHyphen:
        # print("Tokenize on hyphen")
        word_list = list(map(text_processing.tokenize_on_hyphen, word_list))
        # print(word_list[1:20])

    # lower case
    if args.lowerCase:
        # print("LOWER CASE")
        word_list = list(map(text_processing.make_lower, word_list))
        # print(word_list[1:20])

    # Upper case
    if args.upperCase:
        # print("UPPER CASE")
        word_list = list(map(text_processing.make_upper, word_list))
        # print(word_list[1:20])

    if args.stripAccents:
        # print("STRIP ACCENTS")
        i = 0
        for x in range(len(word_list)):
            correctedWords = [text_processing.strip_accents(y) for y in word_list[x]]
            word_list[x] = correctedWords
            i = i + 1
            if i % 100 == 0:
                print('row %d' % i)
        # word_list = list(map(text_processing.strip_accents, word_list))
        # print(word_list[1:20])

    if args.spellCorrect:
        # print("CORRECT SPELLING")
        word_list = spell_checker.correctall(word_list, SPELL_CHECKER_PATH)
        # print(word_list[1:20])

    if args.removeTags:
        # print("REMOVE WORDS")  # needs function in text_processing
        # note: map() pairs each row of word_list with one element of args.removeTags
        word_list = list(map(text_processing.keep_only_specified_tags, word_list, args.removeTags))

    if args.stopWords:
        # print("STOP WORDS")
        word_list = list(map(text_processing.exclude_stop_words, word_list))
        # print(word_list[1:20])

    if args.lemmatizeall:
        # print("LEMMATIZE ALL")
        word_list = list(map(text_processing.lemmatizeall, word_list))
        # print(word_list[1:20])

    if args.lemmatize:
        # print("LEMMATIZE")
        word_list = list(map(text_processing.lemmatize, word_list))
        print(word_list[1:20])

    if args.stematize:
        # print("STEMATIZE")
        word_list = list(map(text_processing.stematize, word_list))
        # print(word_list[1:20])

    # add unique values to the master set
    master_set = set()
    list(map(master_set.update, word_list))
    sorted(master_set)  # note: sorted() returns a new list; the result is not used here
    # ipdb.set_trace()

    helpers.ensure_dir(args.output_dir)
    # ipdb.set_trace()
    if args.fold_number != 0:
        output_fileName = "master_set_fold_%d.csv" % args.fold_number
    else:
        output_fileName = "master_set.csv"
    output_path = os.path.join(args.output_dir, output_fileName)
    master_set_towrite = pandas.Series(numpy.array(list(master_set)))
    master_set_towrite.to_csv(output_path, index=False)  # write to file, but don't give row names
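# Tiny self-contained example (made-up rows) of the set.update pattern used above to build the master
# set: the union of all tokens that appear in any tokenized description.
word_list_example = [['charity', 'shop', 'income'], ['grants', 'income']]
master_set_example = set()
list(map(master_set_example.update, word_list_example))    # add every row's tokens to the set
print(sorted(master_set_example))                          # ['charity', 'grants', 'income', 'shop']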
def main():
    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    # this is the default input directory if nothing is passed
    INPUT_FILE = os.path.join("..", "..", "data", "iTrain.csv")
    OUTPUT_FILE = os.path.join("..", "..", "data", "features", "data-frame.csv")

    parser.add_argument('--i', '--input-file', type=str, dest='inputFile', default=INPUT_FILE,
                        help='File to be processed to a boolean matrix')
    parser.add_argument('-l', '--lemmatize', dest='lemmatize', action='store_true', default=False,
                        help='Boolean - If set, verbs will be lemmatized')
    parser.add_argument('-la', '--lemmatizeall', dest='lemmatizeall', action='store_true', default=False,
                        help='Boolean - If set, all words will be lemmatized')
    parser.add_argument('-lc', '--lower-case', dest='lowerCase', action='store_true', default=True,
                        help='Boolean - Defaults to converting all words to lower-case')
    parser.add_argument('-rw', '--remove-words', dest='removeWords', action='store_true', default=None,
                        help='Accepts a list of types of words to be removed e.g. ...')
    parser.add_argument('-s', '--stematize', dest='stematize', action='store_true', default=False,
                        help='Boolean - If set, all words will be stematized')
    parser.add_argument('--sa', '--strip-accents', dest='stripAccents', action='store_true', default=False,
                        help="Removes accents on letters, replacing them with just the letter itself")
    parser.add_argument('--sp', '--spelling-corrector', dest='spellCorrect', action='store_true', default=False,
                        help="Correct spelling mistakes word by word, just taking the most likely correction")
    parser.add_argument('--sw', '--stop-words', dest='stopWords', action='store_true', default=False,
                        help='Removes the most common words, "stop words", from the text')
    parser.add_argument('-t', '-tokenize', dest='token', action='store_true', default=False,
                        help='Tokenizes text to individual words')
    parser.add_argument('--ta', '--alpha-numeric', dest='alphaNumeric', action='store_false', default=True,
                        help="Boolean - If NOT set, the file will be tokenized with non alpha-numeric words left in. "
                             "Defaults to '%(default)s'")
    parser.add_argument('--th', '--token-hyphen', dest='tokenHyphen', action='store_true', default=False,
                        help='Tokenizes text using the hierarchy structure')
    parser.add_argument('-uc', '--upper-case', dest='upperCase', action='store_true', default=False,
                        help='Boolean - If set, all words will be converted to upper-case')
    parser.add_argument('-o', '--output-file', type=str, dest='output_file', default=OUTPUT_FILE,
                        help="File to be used to save the created data frame. Defaults to '%(default)s'")
    args = parser.parse_args()

    data_in = pandas.read_csv(args.inputFile, encoding="ISO-8859-1")

    # set column names?
    words = pandas.DataFrame({'type': data_in.type_class,
                              'class': data_in.source_class,
                              'description': data_in.description})
    # print(words.head())

    # =============================================================================
    # specify data frame of the required length
    # =============================================================================
    # frame_len = 0
    # for x in range(1, len(words)):
    #     temp_len = len(words.description[x].split())
    #     if temp_len > frame_len:
    #         frame_len = temp_len
    #         print(frame_len)
    #         print(words.description[x].split())
    # =============================================================================

    processed_data = pandas.DataFrame(data_in[['type_class', 'source_class']])

    # function calls need to be edited to send the relevant columns (excluding type and source).
    # This will require restructuring the data types. Not sure if a dynamic data frame can be used,
    # as we don't know how many words are in each row - it will also differ depending on the processing.

    # tokenize the text either straight or keeping only alpha-numeric (default)
    if args.alphaNumeric:
        # keep just the alpha-numeric characters
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'\w+')
        word_list = list(map(tokenizer.tokenize, words.description))
        # print("alpha numeric only")
        # print(word_list[1:20])
    # else:
    #     word_list = list(map(nltk.word_tokenize, words.description))
    #     print(word_list[1:20])

    if args.tokenHyphen:
        # print("TOKENIZE ON HYPHEN")
        word_list = list(map(text_processing.tokenize_on_hyphen, words.description))
        # print(word_list[1:20])

    # lower case
    if args.lowerCase:
        # print("LOWER CASE")
        word_list = list(map(text_processing.make_lower, word_list))
        # print(word_list[1:20])

    # Upper case
    if args.upperCase:
        # print("UPPER CASE")
        word_list = list(map(text_processing.make_upper, word_list))
        # print(word_list[1:20])

    if args.lemmatizeall:
        # print("LEMMATIZE ALL")
        word_list = list(map(text_processing.lemmatizeall, word_list))
        # print(word_list[1:20])

    if args.lemmatize:
        # print("LEMMATIZE")
        word_list = list(map(text_processing.lemmatize, word_list))
        # print(word_list[1:20])

    if args.removeWords:
        print("REMOVE WORDS")  # needs function in text_processing

    if args.stematize:
        # print("STEMATIZE")
        word_list = list(map(text_processing.stematize, word_list))
        # print(word_list[1:20])

    if args.stopWords:
        # print("STOP WORDS")
        word_list = list(map(text_processing.exclude_stop_words, word_list))
        # print(word_list[1:20])

    wl_df = pandas.DataFrame(word_list)
    frames = [processed_data, wl_df]
    output_df = pandas.concat(frames, axis=1)
    # print(output_df[1:20])

    helpers.ensure_dir(os.path.dirname(args.output_file))
    output_df.to_csv(args.output_file, index=False)  # write to file, but don't give row names
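# Short example of the nltk RegexpTokenizer(r'\w+') used above: it keeps runs of alphanumeric characters
# (and underscores) and drops punctuation, which is what the --ta/--alpha-numeric option relies on.
from nltk.tokenize import RegexpTokenizer

tokenizer_example = RegexpTokenizer(r'\w+')
print(tokenizer_example.tokenize("Grants & donations - 2015/16"))
# ['Grants', 'donations', '2015', '16']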
def main():
    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    # this is the default input directory if nothing is passed
    INPUT_FILE = os.path.join("..", "..", "data", "features", "data_frame.csv")
    OUTPUT_DIR = os.path.join("..", "..", "data", "classifiers", "decision_trees")
    FOLD_FILE = os.path.join("..", "..", "data", "folds", "iTrain.csv")
    INPUT_MS = os.path.join("..", "..", "data", "features", "master_set.csv")

    parser.add_argument('-i', '--input-file', type=str, dest='input_file', default=INPUT_FILE,
                        help="File with features. The classes will be taken from this file. Defaults to '%(default)s'")
    parser.add_argument('--ms', '--master_set', dest='master_set', default=INPUT_MS,
                        help="The input file containing the master set. Defaults to '%(default)s'")
    parser.add_argument('--fn', '--fold-number', dest='fold_number', default=None, type=int,
                        help="The fold number to be EXCLUDED when creating the master set (If there are N folds, "
                             "N-1 folds are used for creating the master set). If 0, all the items will be "
                             "considered. If None, will train the classifier for all the folds in a loop. "
                             "Defaults to '%(default)s'")
    parser.add_argument('--ff', '--fold-file', type=str, dest='foldFile', default=FOLD_FILE,
                        help="Fold file containing the cross-fold validation indices. Defaults to '%(default)s'")
    parser.add_argument('-o', '--output-dir', type=str, dest='output_dir', default=OUTPUT_DIR,
                        help="Directory to save the decision trees. Defaults to '%(default)s'")
    args = parser.parse_args()

    type_dict = {'IV': 1, 'IG': 2, 'IC': 3, 'I': 4, 'IGI': 5, 'IO': 6}

    # read the input matrix and the labels
    data_in = pandas.read_csv(args.input_file, encoding="ISO-8859-1")
    master_in = pandas.read_csv(args.master_set, encoding="ISO-8859-1", header=-1)
    labels_orig = data_in.type_class
    labels = pandas.Series([type_dict[x] for x in labels_orig])

    # Create output directory
    helpers.ensure_dir(args.output_dir)

    dtree = tree.DecisionTreeClassifier(random_state=0, max_depth=100, criterion='entropy')

    if args.fold_number is None:
        # loop over all folds and create the classifier
        print("Training will be done iteratively for all folds...\n")
        cross_fold_indices = pandas.read_csv(args.foldFile, header=None)[0]
        for fold in cross_fold_indices.unique():
            print("Training classifier for fold %d...\n" % fold)
            data_fold = data_in.loc[cross_fold_indices != fold, :]
            # import ipdb; ipdb.set_trace()  # debug breakpoint, disabled
            data_matrix = data_manipulation.binary_sparse_matrix(data_fold, master_in)
            labels_fold = labels[cross_fold_indices != fold]
            dtree.fit(data_matrix, labels_fold)
            joblib.dump(dtree, os.path.join(args.output_dir, 'tree-fold%d.pkl' % fold))
            tree.export_graphviz(dtree,
                                 out_file=os.path.join(args.output_dir, 'tree-fold%d.dot' % fold),
                                 max_depth=5,
                                 feature_names=master_in.values)
    elif args.fold_number == 0:
        # use all the data to train the classifier
        print("Training classifier for the full set...\n")
        data_matrix = data_manipulation.binary_sparse_matrix(data_in, master_in)
        dtree.fit(data_matrix, labels)
        joblib.dump(dtree, os.path.join(args.output_dir, 'tree.pkl'))
    else:
        # create classifier for a particular fold
        print("Training classifier for fold %d...\n" % args.fold_number)
        cross_fold_indices = pandas.read_csv(args.foldFile, header=None)[0]
        data_fold = data_in.loc[cross_fold_indices != args.fold_number]
        data_matrix = data_manipulation.binary_sparse_matrix(data_fold, master_in)
        labels = labels[cross_fold_indices != args.fold_number]
        dtree.fit(data_matrix, labels)
        joblib.dump(dtree, os.path.join(args.output_dir, 'tree-fold%d.pkl' % args.fold_number))
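# Minimal round-trip sketch of how the trees dumped above can be loaded back for prediction (the
# evaluation script in this codebase does the same via joblib.load). The data, path and import style
# here are illustrative only; on newer scikit-learn versions use "import joblib" instead.
from sklearn import tree
from sklearn.externals import joblib

X = [[0, 1], [1, 0], [1, 1], [0, 0]]
y = [1, 0, 1, 0]
dtree_example = tree.DecisionTreeClassifier(random_state=0, max_depth=100, criterion='entropy').fit(X, y)
joblib.dump(dtree_example, 'tree-fold1.pkl')

dtree_loaded = joblib.load('tree-fold1.pkl')
print(dtree_loaded.predict([[0, 1]]))   # [1]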
def main():
    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    # this is the default input directory if nothing is passed
    INPUT_MS = os.path.join("..", "..", "data", "features", "master_set.csv")
    INPUT_DF = os.path.join("..", "..", "data", "features", "data_frame.csv")
    OUTPUT_FILE = os.path.join("..", "..", "data", "features", "sparse_matrix.pkl")

    parser.add_argument('--im', '--master-set-file', type=str, dest='inputMS', default=INPUT_MS,
                        help='Master set of all words to be processed to a boolean matrix')
    parser.add_argument('--id', '--data-frame-file', type=str, dest='inputDF', default=INPUT_DF,
                        help='Data frame processed by "create_data_frame"')
    parser.add_argument('-o', '--output-file', type=str, dest='output_file', default=OUTPUT_FILE,
                        help="File to save the sparse matrix. Defaults to '%(default)s'")
    args = parser.parse_args()

    master_in = pandas.read_csv(args.inputMS, encoding="ISO-8859-1", header=-1)
    data_in = pandas.read_csv(args.inputDF, encoding="ISO-8859-1")

    numrows = len(data_in)
    numcols = len(master_in)
    rowind = []
    colind = []
    data = []
    for i in range(len(data_in)):  # fixed: previously iterated over data_in[:3] only (debugging leftover)
        thisrow = data_in.iloc[i, 2:]
        valid = thisrow.dropna()
        validset = set(list(valid))
        for setelem in validset:
            master_set_ind = master_in.loc[master_in[0] == setelem][0].index[0]
            rowind.append(i)
            colind.append(master_set_ind)
            data.append(1)

    # import ipdb; ipdb.set_trace()  # debug breakpoint, disabled
    sparse_mat = scipy.sparse.coo_matrix((numpy.array(data), (numpy.array(rowind), numpy.array(colind))),
                                         shape=(numrows, numcols))

    # save the matrix
    helpers.ensure_dir(os.path.dirname(args.output_file))
    joblib.dump(sparse_mat, args.output_file)

    # Leftover scratch code, disabled: it references undefined names (df_in, master_set, master_cols)
    # and would crash if executed. Kept only for reference.
    # len(df_in.columns)
    # df_in.iloc(5)
    # ipdb.set_trace()
    # for loop here through the rows
    # for x in range(0, len(df_in)):
    #     # for loop through cells in row - starting from 3rd? - where master set words start
    #     for y in range(2, len(df_in.columns)):
    #         print('x', x)
    #         print('y', y)
    #         token = df_in.iloc[x, y]
    #         this_cell = [1 if token in master_set else 0]  # for w in token
    #         # df_in.iloc[:1]  # access row one
    #         if this_cell == 1:
    #             master_cols.ix[x, token] = 1
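# Small self-contained example of the scipy.sparse.coo_matrix construction used above: parallel arrays of
# row indices, column indices and values define the non-zero entries of the matrix.
import numpy
import scipy.sparse

rowind_example = [0, 0, 2]
colind_example = [1, 3, 2]
data_example = [1, 1, 1]
sparse_example = scipy.sparse.coo_matrix(
    (numpy.array(data_example), (numpy.array(rowind_example), numpy.array(colind_example))),
    shape=(3, 4))
print(sparse_example.toarray())
# [[0 1 0 1]
#  [0 0 0 0]
#  [0 0 1 0]]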