def post_dump(self, *args, **kwargs):
    """Uncompress the dumped files and merge them into a single table.

    ``gunzipall`` is run on ``self.new_data_folder``; three tab-separated
    files are then read from that folder (``meddra_freq.tsv``,
    ``meddra_all_se.tsv`` and ``meddra_all_indications.tsv``), outer-merged,
    sorted by ``stitch_id(flat)`` and written to
    ``merged_freq_all_se_indications.tsv`` in the same folder.

    Positional and keyword arguments are accepted and ignored.
    """
    folder = self.new_data_folder
    gunzipall(folder)
    self.logger.info("Merging files")

    FREQ = os.path.join(folder, "meddra_freq.tsv")
    ALL_SE = os.path.join(folder, "meddra_all_se.tsv")
    ALL_INDICATIONS = os.path.join(folder, "meddra_all_indications.tsv")
    MERGED = os.path.join(folder, "merged_freq_all_se_indications.tsv")

    # Merge the first two files: side effects with frequency, and side
    # effects. Column names are assigned after loading.
    # NOTE(review): read_csv is called with its default header handling, so
    # the first row of each file is consumed as a header before the names
    # below overwrite it. If the files have no header row, that first record
    # is lost -- confirm, and if so switch to header=None / names=[...].
    df1 = pd.read_csv(FREQ, delimiter='\t')
    df1.columns = [
        'stitch_id(flat)', 'stitch_id(stereo)', 'umls_id(label)',
        'is_placebo', 'desc_type', 'lower', 'upper', 'meddra_type',
        'umls_id(meddra)', 'se_name',
    ]
    df2 = pd.read_csv(ALL_SE, delimiter='\t')
    df2.columns = [
        'stitch_id(flat)', 'stitch_id(stereo)', 'umls_id(label)',
        'meddra_type', 'umls_id(meddra)', 'se_name',
    ]
    s1 = pd.merge(
        df1, df2, how='outer',
        on=[
            'stitch_id(flat)', 'stitch_id(stereo)', 'umls_id(label)',
            'meddra_type', 'umls_id(meddra)', 'se_name',
        ],
    )

    # Merge the result above with the indications file.
    df4 = pd.read_csv(ALL_INDICATIONS, delimiter='\t')
    df4.columns = [
        'stitch_id(flat)', 'umls_id(label)', 'method_of_detection',
        'concept_name', 'meddra_type', 'umls_id(meddra)',
        'concept_name(meddra)',
    ]
    s2 = pd.merge(
        s1, df4, how='outer',
        on=[
            'stitch_id(flat)', 'umls_id(label)', 'meddra_type',
            'umls_id(meddra)',
        ],
    )

    # DataFrame.sort() was removed from pandas; sort_values() is the
    # replacement and returns a new, sorted frame.
    s3 = s2.sort_values('stitch_id(flat)')
    # to_csv() is called with its defaults, so the output is comma-separated
    # and includes the index column even though the file name ends in ".tsv".
    s3.to_csv(MERGED)
    self.logger.info("Files successfully merged, ready to be uploaded")
def post_dump(self, *args, **kwargs):
    """Uncompress the dumped files, then split ``mvi_ca`` into chunks.

    ``gunzipall`` is run on ``self.new_data_folder``. The file ``mvi_ca``
    in that folder is then passed to the external ``split`` command with
    ``-l <CHUNK_SIZE>`` (``CHUNK_SIZE`` is read from the instance's class);
    the output files use the prefix ``<input_file>.split.``.

    Positional and keyword arguments are accepted and ignored.

    Raises:
        subprocess.CalledProcessError: if ``split`` exits with a non-zero
            status.
    """
    folder = self.new_data_folder
    self.logger.info("Unzipping files in '%s'" % folder)
    gunzipall(folder)

    input_file = os.path.join(folder, "mvi_ca")
    chunk_size = self.__class__.CHUNK_SIZE
    output_prefix = input_file + ".split."

    self.logger.info("Split file in chunks")
    # Argument list (no shell), so the paths are passed to split verbatim.
    split_cmd = ["split", "-l", "%s" % chunk_size, input_file, output_prefix]
    subprocess.check_call(split_cmd)
def post_dump(self, *args, **kwargs):
    """Uncompress the dumped files and merge them into a single table.

    ``gunzipall`` is run on ``self.new_data_folder``; three tab-separated
    files are then read from that folder (``meddra_freq.tsv``,
    ``meddra_all_se.tsv`` and ``meddra_all_indications.tsv``), outer-merged,
    sorted by ``stitch_id(flat)`` and written to
    ``merged_freq_all_se_indications.tsv`` in the same folder.

    Positional and keyword arguments are accepted and ignored.
    """
    # Column names assigned to each file after loading.
    freq_columns = (
        'stitch_id(flat)', 'stitch_id(stereo)', 'umls_id(label)',
        'is_placebo', 'desc_type', 'lower', 'upper', 'meddra_type',
        'umls_id(meddra)', 'se_name',
    )
    se_columns = (
        'stitch_id(flat)', 'stitch_id(stereo)', 'umls_id(label)',
        'meddra_type', 'umls_id(meddra)', 'se_name',
    )
    indication_columns = (
        'stitch_id(flat)', 'umls_id(label)', 'method_of_detection',
        'concept_name', 'meddra_type', 'umls_id(meddra)',
        'concept_name(meddra)',
    )
    # Join keys for the second merge (frequency/side-effect table with the
    # indications table).
    indication_keys = (
        'stitch_id(flat)', 'umls_id(label)', 'meddra_type', 'umls_id(meddra)',
    )

    def load(path, column_names):
        # NOTE(review): read_csv runs with its default header handling, so
        # the first row of the file is consumed as a header before being
        # renamed here -- confirm the files really carry a header row.
        frame = pd.read_csv(path, delimiter='\t')
        frame.columns = list(column_names)
        return frame

    data_folder = self.new_data_folder
    gunzipall(data_folder)
    self.logger.info("Merging files")

    freq_path = os.path.join(data_folder, "meddra_freq.tsv")
    se_path = os.path.join(data_folder, "meddra_all_se.tsv")
    indications_path = os.path.join(data_folder, "meddra_all_indications.tsv")
    merged_path = os.path.join(
        data_folder, "merged_freq_all_se_indications.tsv")

    # First merge: side effects with frequency + side effects. The join keys
    # are exactly the columns of the side-effect file.
    freq_frame = load(freq_path, freq_columns)
    se_frame = load(se_path, se_columns)
    side_effects = freq_frame.merge(se_frame, how='outer', on=list(se_columns))

    # Second merge: add the indications.
    indications_frame = load(indications_path, indication_columns)
    combined = side_effects.merge(
        indications_frame, how='outer', on=list(indication_keys))

    # to_csv() defaults are kept: comma-separated output, index included.
    combined.sort_values('stitch_id(flat)').to_csv(merged_path)
    self.logger.info("Files successfully merged, ready to be uploaded")
def post_dump(self, *args, **kwargs):
    """Log the target folder, then uncompress the files it contains.

    ``gunzipall`` is run on ``self.new_data_folder``.

    Positional and keyword arguments are accepted and ignored, so the hook
    can be invoked with extra arguments without raising ``TypeError``; the
    zero-argument call form keeps working unchanged.
    """
    folder = self.new_data_folder
    self.logger.info("Uncompressing files in '%s'" % folder)
    gunzipall(folder)
def post_dump(self, *args, **kwargs):
    """Uncompress the files in ``self.new_data_folder`` via ``gunzipall``.

    Positional and keyword arguments are accepted and ignored, so the hook
    can be invoked with extra arguments without raising ``TypeError``; the
    zero-argument call form keeps working unchanged.
    """
    gunzipall(self.new_data_folder)
def post_dump(self, *args, **kwargs):
    """Uncompress the files in ``self.new_data_folder`` via ``gunzipall``.

    Positional and keyword arguments are accepted and ignored.
    """
    data_folder = self.new_data_folder
    gunzipall(data_folder)