def main():
    """Build the merged feature file: parse the raw data, add engineered
    features, join everything on ``id``, and (train set only) clean it."""
    # Step 1: parse the raw input into a tabular file.
    print("Parsing of original data...")
    parse.main(original_file, parsed_file, prefix)

    # Step 2: derive additional features from the parsed data.
    print("Creating additional features...")
    add_features.main(parsed_file, rain_dataset, added_features_file)

    # Step 3: join parsed data with both feature files on 'id'.
    # how="left" keeps rows with missing OSRM distances (they become NaN).
    print("Merging all features...")
    extra = pd.read_csv(added_features_file).merge(
        pd.read_csv(distances_features_file), on='id', how='left')
    combined = pd.read_csv(parsed_file).merge(extra, on='id')
    combined.to_csv(merged_file, index=False)

    # Step 4: the test set is left as-is; only the train set gets cleaned.
    if prefix == "train":
        print("Cleaning data...")
        clean.main(merged_file, merged_file)
def main(sets, path='data/'):
    """Run the German QC preprocessing pipeline end to end.

    Combines the raw sources, cleans and splits them, runs RFTagger for
    lemmas/POS tags, then streamlines and extracts the final data set.
    """
    rft_dir = ('C:\\Users\\Josef\\PycharmProjects\\QC-Yes-No\\'
               'Classification\\preprocessing\\RFTagger\\')

    # Stages before the tagger, run in order with a progress message each.
    pre_tagger = [
        ('Combining raw data sets...', lambda: combine_sources.main(sets, path)),
        ('Cleaning string for Lemmatization and Tagging...', lambda: clean.main(path)),
        ('Splitting the data for Lemmatization and Tagging...', lambda: pos_prep.main(path)),
    ]
    for message, stage in pre_tagger:
        print(message)
        stage()

    print('Creating Lemmas and Pos Tags using RFTagger...')
    # NOTE(review): shell=True combined with a list of args only behaves as
    # intended on Windows; confirm before running this elsewhere.
    run([
        'java', '-jar', rft_dir + 'rft-java.jar',
        '-c', 'stts', '-l', 'german',
        '-x', rft_dir + 'lib\\german-rft-tagger-lemma-lexicon.txt',
        rft_dir + 'lib\\german.par',
        path + 'all_pre_pos.txt',
        path + 'pos_tagged_raw.txt',
    ], shell=True)

    print('Further streamlining data...')
    reclean.main(path)
    print('Extracting final data set...')
    extract_info_from_tag.main(path)
    print('Process finished.')
def main(args):
    """ Main running script: create the output directory tree, then run each
    pipeline stage (clean, download, train, predict, qaqc, plot) that was
    requested via the command-line flags on *args*.
    """
    # Get the config file
    config = util.get_config(args.config)
    root_dir = config['ROOT_DIR']

    # fill out initial folders (no-op for any that already exist)
    _ensure_dir('{}/metadata'.format(root_dir), 'metadata')
    _ensure_dir(config['OBS_ROOT'], 'OBS')
    _ensure_dir(config['ESTIMATORS_ROOT'], 'ESTIMATORS')
    _ensure_dir(config['PREDICTIONS_ROOT'], 'PREDICTIONS')
    _ensure_dir(config['QAQC_ROOT'], 'QAQC')
    _ensure_dir(config['PLOT_ROOT'], 'PLOT')

    # --- clean database ---
    if args.clean:
        clean.main(config)
    else:
        print('skipping database cleaning')
    # --- download data ---
    if args.download:
        download.main(config)
    else:
        print('skipping download of new data')
    # --- train models ---
    if args.train:
        train.main(config)
    else:
        print('skip training')
    # --- make predictions ---
    if args.predict:
        predict.main(config)
    else:
        # BUG FIX: previously printed 'skipping download of new data'
        # (copy-paste from the download branch).
        print('skipping predictions')
    # --- run qaqc checks ---
    if args.qaqc:
        qaqc.main(config)
    else:
        print('skipping qaqc')
    # --- plot ---
    if args.plot:
        plot.main(config)
    else:
        print('skipping plots')


def _ensure_dir(path, label):
    """Create *path* if it does not exist and announce it by *label*."""
    if not os.path.isdir('{}'.format(path)):
        os.mkdir('{}'.format(path))
        print('created {} dir'.format(label))
def main():
    """Clean, then build. Returns clean's status when it fails,
    otherwise build's status."""
    status = clean.main()
    # Abort on a failed clean; only a zero status proceeds to the build.
    return status if status != 0 else build.main()
def main():
    """Refresh the data directory: wipe its contents, scrape fresh data,
    then run the cleaning step."""
    # Order matters: old files are removed before new ones are scraped.
    delete_data_dir_contents()
    scrap()
    clean.main()
import Predictive_Analysis_Classification
import clean
import Clustering_Associationrules
import t_test


def _banner(title):
    """Print the three-line section banner used between analysis phases."""
    print("##############################################")
    print("# " + title)
    print("##############################################")


if __name__ == '__main__':
    # Basic Statistical Analysis and data cleaning insight Part
    _banner("Basic Statistical Analysis and data cleaning insight Part")
    clean.main()

    # Cluster and Association Rule Part
    _banner("Cluster and Association Rule Part")
    Clustering_Associationrules.execute()

    # Predictive_Analysis_Hypothesis_Test Part
    _banner("Predictive_Analysis_Hypothesis_Test Part")
    t_test.execute()

    # Predictive_Analysis_Classification Part
    _banner("Predictive_Analysis_Classification Part")
    Predictive_Analysis_Classification.execute()
def main(args):
    """Collect the arXiv PDFs under the current directory and hand them to
    the cleaning step via ``args.files``."""
    # Materialize as a list rather than a lazy filter object: a filter is a
    # one-shot iterator, so clean.main could only traverse it once (and
    # len() would fail). A list keeps every downstream use safe.
    args.files = [f for f in catalogue.utils.list_files(".pdf") if is_arxiv(f)]
    clean.main(args)