def _load_college_map():
    ret = {}
    for line in utils.read_file(College.college_map_file):
        key, value = line.split("=")
        assert value in STD_COLLEGE
        ret[key] = value
    return ret
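# A minimal, self-contained sketch of the input contract _load_college_map
# relies on: one "key=value" pair per line, with every value already present
# in STD_COLLEGE. The inline sample data below is hypothetical and only
# stands in for utils.read_file(College.college_map_file).
def _demo_college_map_format():
    STD_COLLEGE = {"Massachusetts Institute of Technology"}
    lines = ["MIT=Massachusetts Institute of Technology"]
    ret = {}
    for line in lines:
        # split("=") assumes exactly one '=' per line; an extra '=' in the
        # value would make this unpacking raise a ValueError
        key, value = line.split("=")
        assert value in STD_COLLEGE
        ret[key] = value
    return ret  # -> {'MIT': 'Massachusetts Institute of Technology'}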
def find_image_depth_matching(scene):
    pairs = uts.read_file(DATA_PATH + scene + '/id_img2depth.txt')
    id_img2depth = {}
    for pair in pairs:
        image_name, depth_name = pair.split(' ')
        id_img2depth[image_name] = depth_name
    return id_img2depth
def run_process(q_recv, q_send, in_folder, out_folder,
                failed_extractions_file, max_tries, use_diffbot):
    """
    Tries 'max_tries' times to extract text from each HTML file.
    At the end, if using Diffbot, tries one last time with Boilerpipe.
    """
    texts, trec_ids = [], []

    def retrieve_texts_from_html(html, use_diffbot=False):
        """ Use the Diffbot API/Boilerpipe to retrieve texts from HTML """
        if use_diffbot:
            dummy_url = 'https://www.diffbot.com/dev/analytics/'
            url_api = "https://api.diffbot.com/v3/article?token=%s" \
                      "&discussion=false&url=%s" % (DIFFBOT_TOKEN, dummy_url)
            headers = {'Content-type': 'text/html'}
            content = json.loads(
                requests.post(url_api, data=html, headers=headers).text)
            text = content["objects"][0]["text"]
            title = content["objects"][0]["title"]
            text = '\n'.join([title, text])
        else:
            text = Extractor(extractor='ArticleExtractor', html=html).getText()
        return text

    while True:
        trec_id = q_recv.get()
        # Check end condition
        if trec_id is None:
            break
        # Check if file exists
        if not os.path.isfile("%s/%s" % (in_folder, trec_id)):
            continue
        # Read HTML
        html = read_file("%s/%s" % (in_folder, trec_id), encoding='latin1')
        i = 0
        while i != max_tries:
            try:
                texts.append(
                    retrieve_texts_from_html(html, use_diffbot=use_diffbot))
                trec_ids.append(trec_id)
                break
            except Exception as e:
                # Extraction failed
                # print(e)
                i += 1
        if i == max_tries:
            write_file("%s\n" % trec_id, failed_extractions_file, 'a')

    q_send.put((texts, trec_ids))
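# A minimal wiring sketch for run_process, assuming the standard
# multiprocessing module; the worker count, helper name, and argument values
# are hypothetical. Each worker drains q_recv until it sees the None
# sentinel, then reports its (texts, trec_ids) batch on q_send.
import multiprocessing as mp

def extract_all_sketch(trec_ids, in_folder, out_folder, failed_file,
                       max_tries=3, use_diffbot=False, n_workers=4):
    q_recv, q_send = mp.Queue(), mp.Queue()
    workers = [mp.Process(target=run_process,
                          args=(q_recv, q_send, in_folder, out_folder,
                                failed_file, max_tries, use_diffbot))
               for _ in range(n_workers)]
    for w in workers:
        w.start()
    for trec_id in trec_ids:
        q_recv.put(trec_id)
    for _ in workers:  # one None sentinel per worker ends its loop
        q_recv.put(None)
    texts, ids = [], []
    for _ in workers:  # collect one result batch per worker before joining
        batch_texts, batch_ids = q_send.get()
        texts += batch_texts
        ids += batch_ids
    for w in workers:
        w.join()
    return texts, ids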
def read_sample_dir(root_dir):
    try:
        file_path = root_dir + "/" + "sample_name_dir.txt"
        samples = utils.read_file(file_path, "r")
        return samples
    except Exception as e:
        print("Error occurred while reading the file : {} \n {}".format(
            file_path, e))
def process(q_recv, q_send, corpus_folder, remove_stopwords):
    articles = []
    while True:
        cwid = q_recv.get()
        # Check end condition
        if cwid is None:
            break
        # Article is already encoded in UTF-8
        article = read_file("%s/%s" % (corpus_folder, cwid))
        # PREPROCESS
        articles.append(preprocess_text(article, tokenize=True,
                                        all_lower=True,
                                        stopw=remove_stopwords).split())
    # Send info back
    q_send.put(articles)
def main(argv):
    argv = FLAGS(argv)  # parse argv to FLAG
    scene_names = uts.read_file(DATA_PATH + 'test_sun3d.txt')
    scene_names = scene_names[FLAGS.start:FLAGS.end]
    for scene_name in scene_names:
        scene_name += '/'
        print(FLOW_PATH + scene_name + 'flow/')
        # Download file
        if not os.path.exists(DATA_PATH + scene_name + 'id_img2depth.txt'):
            print('Retrieve Data')
            os.system("/home/peng/SUN3DCppReader/src/build/SUN3DCppReader "
                      + scene_name + " " + DATA_PATH)
        if not os.path.exists(DATA_PATH + scene_name + '/id_img2depth.txt'):
            continue
        if not os.path.exists(FLOW_PATH + scene_name + '/flow/'):
            uts.mkdir_if_need(FLOW_PATH + scene_name + '/flow/')
        id_img2depth = find_image_depth_matching(scene_name)
        gen_img_pair_data(scene_name, 500, id_img2depth)
# %%
sheet_name = "Holiday_Package.csv"
target_variable = 'Holliday_Package'
modal = [['Decision_tree', 10], ['random_forest', 5], ['neural_network', 10],
         ['logistic_regression', 3], ['lda', 10], ['knn', 5],
         ['naive_bayes', 3, 2, 'f1'], ['bagging', 3, 2, 'f1'],
         ['ada_boost', 3, 2, 'f1'], ['gradient_boosting', 3, 2, 'f1'],
         ['support_vector_machine', 3]]
modals_data = pd.DataFrame(
    modal, columns=['Modal', 'cross_validation', 'n_jobs', 'scoring'])

# %%
print(modals_data)

# %%
df_holiday_package = utils.read_file(sheet_name)
display(df_holiday_package)

# %%
df_holiday_package = utils.drop_columns_by_colnums(df_holiday_package, [0])
display(df_holiday_package)

# %%
utils.info(df_holiday_package)

# %%
categorical_columns = utils.get_categorical_cols(df_holiday_package)
print(categorical_columns)

# %%
continuos_columns = utils.get_continous_cols(df_holiday_package)
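# %%
# Note on the ragged 'modal' rows above: when the column list is longer than
# a row, pandas pads the short rows with NaN, so entries like
# ['Decision_tree', 10] simply leave 'n_jobs' and 'scoring' empty. A small
# self-contained check of that behavior:
import pandas as pd
demo = pd.DataFrame([['a', 1], ['b', 2, 3, 'f1']],
                    columns=['Modal', 'cross_validation', 'n_jobs', 'scoring'])
print(demo)  # row 'a' shows NaN for n_jobs and scoring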
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'

# %%
import pandas as pd
import numpy as np
import sys
import os
import traceback
from IPython.display import display
from utils import utils
from bivariate_analysis import bivariate_analysis

# %%
df_gems = utils.read_file("cubic_zirconia.csv")
display(df_gems)

# %%
df_gems = utils.drop_columns_by_colnums(df_gems, [0])
display(df_gems)

# %%
categorical_cols = utils.get_categorical_cols(df_gems)
display(categorical_cols)

# %%
continous_cols = utils.get_continous_cols(df_gems)
display(continous_cols)

# %%
utils.describe(df_gems, continous_cols)
def _load_std_college():
    return set(utils.read_file(College.std_college_file))
import sys

from utils.sql_ops import Sqlite
from utils import utils

field = [
    "id", "name", "name_ruby", "department_1", "department_2",
    "likes", "dislikes", "like_foods", "dislike_foods", "introduction"
]
types = [
    "INTEGER", "TEXT", "TEXT", "TEXT", "TEXT",
    "TEXT", "TEXT", "TEXT", "TEXT", "TEXT"
]
primary = "id"


def insert_chara(data):
    chara = Sqlite("starlightRe.db")
    chara.create("chara", field, types, primary)
    chara.insert("chara", data)
    chara.close()


if __name__ == "__main__":
    data = utils.read_file(sys.argv[1], field)
    print(data[0])
    insert_chara(data)
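# The Sqlite class comes from the project's own utils.sql_ops and its
# implementation isn't shown here. A minimal sketch of what such a wrapper
# could look like on top of the standard sqlite3 module (an assumption, not
# the project's actual code):
import sqlite3

class SqliteSketch:
    def __init__(self, db_path):
        self.conn = sqlite3.connect(db_path)

    def create(self, table, fields, types, primary):
        # Build "name TYPE [PRIMARY KEY]" column definitions
        cols = ", ".join(
            "%s %s%s" % (f, t, " PRIMARY KEY" if f == primary else "")
            for f, t in zip(fields, types))
        self.conn.execute("CREATE TABLE IF NOT EXISTS %s (%s)" % (table, cols))

    def insert(self, table, rows):
        # Assumes 'rows' is a non-empty list of equal-length tuples
        marks = ", ".join("?" for _ in rows[0])
        self.conn.executemany(
            "INSERT INTO %s VALUES (%s)" % (table, marks), rows)
        self.conn.commit()

    def close(self):
        self.conn.close()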
# File path that contains the list of samples to be analyzed.
if "No" in mv_input:
    print(
        "Please provide a path to the .txt file containing the list of samples to be analyzed."
    )
    file_input = input()
    file_check = path.exists(file_input)
    while file_check is False:
        print(
            "Please provide a valid path to the file containing the list of samples to be analyzed."
        )
        file_input = input()
        file_check = path.exists(file_input)
    samples = []
    if file_check is True:
        samples = utils.read_file(file_input, "r")
    if len(samples) > 0:
        # Convert each sample's FASTQ to BAM, then run the cohort-level
        # variant-calling steps once over the whole root directory
        for s in samples:
            run_bam_from_fastq.bam_from_fastq(rootDir, s)
        haplotypecaller.run_haplotypecaller(rootDir)
        combine_gvcfs.run_combine_gvcfs(rootDir)
        genotype_gvcfs.run_genotype_vcfs(rootDir)
        variant_calibration.run_variant_calibration_method(rootDir)
import os
import traceback

import pandas as pd
from IPython.display import display
from utils import utils
from modals import modals
from bivariate_analysis import bivariate_analysis

sheet_name = "insurance_claim.csv"
target_variable = 'Claimed'
modal = [['Decision_tree', 10], ['Random_forest', 5, -1],
         ['Neural_Network', 10, -1], ['Logistic_Regression', 3, -1],
         ['LDA', 10, -1], ['KNN', 5, -1], ['Naive_Bayes', 3, -1, 'f1'],
         ['Bagging', 3, -1, 'f1'], ['Ada_Boost', 3, -1, 'f1'],
         ['Gradient_Boosting', 3, -1, 'f1'], ['Support_Vector_Machine', 3, -1]]
modals_data = pd.DataFrame(
    modal, columns=['Modal', 'cross_validation', 'n_jobs', 'scoring'])
display(modals_data)
display(modals_data.iloc[0])

df_insurance_claim = utils.read_file(sheet_name)
display(df_insurance_claim)

# df_insurance_claim = utils.drop_columns_by_colnums(df_insurance_claim, [0])
# display(df_insurance_claim)

utils.check_duplicate_data(df_insurance_claim)
categorical_columns = utils.get_categorical_cols(df_insurance_claim)
continous_columns = utils.get_continous_cols(df_insurance_claim)
utils.convert_cat_into_code(df_insurance_claim, categorical_columns)

modals.check_imbalance(df_insurance_claim[target_variable])
X, Y = modals.extract_target_column(df_insurance_claim, target_variable)
X_train, X_test, Y_train, Y_test = modals.split_data_into_train_test(X, Y, 0.30, 1)
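# modals.split_data_into_train_test is project-specific and its body isn't
# shown here. Given the call above, (X, Y, 0.30, 1) plausibly maps to a 30%
# test share and a fixed random seed; a minimal sketch under that assumption,
# built on scikit-learn (the stratify choice is this sketch's addition, to
# preserve the 'Claimed' class balance that check_imbalance inspects):
from sklearn.model_selection import train_test_split

def split_data_into_train_test_sketch(X, Y, test_size, random_state):
    return train_test_split(X, Y, test_size=test_size,
                            random_state=random_state, stratify=Y)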