def join() -> None:
    """Join input rows against a hash file on a key column.

    First loads key -> value from the hash file, then streams the input
    file, inserting the matched value (or "unknown", or dropping the row,
    per configuration) and prints counters at the end.
    """
    lookup = dict()
    count_found = 0
    count_unknown_added = 0
    count_discarded = 0
    with TsvReader(ConfigJoin.hash_file) as hash_handle:
        if ConfigProgress.progress:
            hash_handle = tqdm.tqdm(hash_handle, desc="reading hash")
        for row in hash_handle:
            lookup[row[ConfigJoin.hash_key_column]] = row[ConfigJoin.hash_value_column]
    with TsvReader(ConfigInputFile.input_file) as in_handle, \
            TsvWriter(ConfigOutputFile.output_file) as out_handle:
        if ConfigProgress.progress:
            in_handle = tqdm.tqdm(
                in_handle, desc="reading input and writing output")
        for row in in_handle:
            key = row[ConfigJoin.input_key_column]
            if key in lookup:
                count_found += 1
                row.insert(ConfigJoin.output_insert_column, lookup[key])
                out_handle.write(row)
            elif ConfigJoin.output_add_unknown:
                count_unknown_added += 1
                row.insert(ConfigJoin.output_insert_column, "unknown")
                out_handle.write(row)
            else:
                count_discarded += 1
    print("event_found {}".format(count_found))
    print("event_unknown_added {}".format(count_unknown_added))
    print("event_discarded {}".format(count_discarded))
def majority() -> None:
    """
    For each first-column value, emit the second-column value it co-occurs
    with most (weighted by the multiplication column).

    This means that if x1 appears more with y2 than any other values in
    column Y then x1, y2 will be in the output and no other entry with x1
    will appear
    """
    # BUG FIX: the annotation was Dict[Dict[str, int]] — missing the outer
    # key type. The structure built here is str -> (str -> int). (Local
    # annotations are not evaluated at runtime, which is why the malformed
    # generic never raised.)
    d: Dict[str, Dict[str, int]] = defaultdict(dict)
    with TsvReader(filename=ConfigInputFile.input_file) as input_file_handle:
        if ConfigProgress.progress:
            input_file_handle = tqdm.tqdm(input_file_handle)
        for fields in input_file_handle:
            p_first = fields[ConfigMajority.input_first_column]
            p_second = fields[ConfigMajority.input_second_column]
            p_multiplication = int(
                fields[ConfigMajority.input_multiplication_column])
            if p_second not in d[p_first]:
                d[p_first][p_second] = 0
            d[p_first][p_second] += p_multiplication
    with TsvWriter(
            filename=ConfigOutputFile.output_file) as output_file_handle:
        for p_first, p_dict in d.items():
            # pick the second value with the highest accumulated weight;
            # dict.get replaces the closure-capturing lambda of the original
            p_second = max(p_dict, key=p_dict.get)
            p_count = p_dict[p_second]
            output_file_handle.write([
                p_first,
                p_second,
                str(p_count),
            ])
def sample_by_column_old() -> None:
    """Sample input rows with probability proportional to a weight column.

    In hits mode each distinct sampled row is written once together with
    the number of times it was drawn; otherwise every draw is written as
    its own output row.
    """
    weights = []
    elements = []
    sum_weights = float(0)
    with TsvReader(ConfigInputFile.input_file) as input_handle:
        if ConfigProgress.progress:
            input_handle = tqdm.tqdm(input_handle)
        for fields in input_handle:
            elements.append(fields)
            weight = float(fields[ConfigSampleColumn.sample_column])
            sum_weights += weight
            weights.append(weight)
    # the following code will only work on python3.6 because the
    # random.choices API was only introduced then
    # from random import choices
    # results = choices(lines, weights, k=size)
    # this is the same code with numpy, which needs normalized weights
    weights = [w / sum_weights for w in weights]
    if ConfigSampleByColumnOld.hits_mode:
        results_dict = defaultdict(int)
        for _ in range(ConfigSampleSize.size):
            current_result = numpy.random.choice(
                a=len(elements),
                replace=ConfigReplace.replace,
                size=1,
                p=weights,
            )
            current_result = current_result[0]
            results_dict[current_result] += 1
        with TsvWriter(ConfigOutputFile.output_file) as output_handle:
            for result, hits in results_dict.items():
                record = list(elements[result])
                # BUG FIX: hits is an int; every other write path in this
                # file stringifies fields before writing, so do the same
                record.append(str(hits))
                output_handle.write(record)
    else:
        results = numpy.random.choice(
            a=len(elements),
            replace=ConfigReplace.replace,
            size=ConfigSampleSize.size,
            p=weights,
        )
        with TsvWriter(ConfigOutputFile.output_file) as output_handle:
            for result in results:
                output_handle.write(elements[result])
def cut() -> None:
    """Project the configured columns out of every input row."""
    with TsvReader(filename=ConfigInputFile.input_file) as reader:
        with TsvWriter(filename=ConfigOutputFile.output_file) as writer:
            source = tqdm.tqdm(reader) if ConfigProgress.progress else reader
            for row in source:
                writer.write([row[index] for index in ConfigColumns.columns])
def lc() -> None:
    """Lower-case the configured columns of every input row."""
    with TsvReader(filename=ConfigInputFile.input_file) as reader:
        with TsvWriter(filename=ConfigOutputFile.output_file) as writer:
            source = tqdm.tqdm(reader) if ConfigProgress.progress else reader
            for row in source:
                for index in ConfigColumns.columns:
                    row[index] = row[index].lower()
                writer.write(row)
def multiply() -> None:
    """Emit each row N times, N being the integer in the configured column."""
    with TsvReader(filename=ConfigInputFile.input_file) as reader:
        with TsvWriter(filename=ConfigOutputFile.output_file) as writer:
            source = tqdm.tqdm(reader) if ConfigProgress.progress else reader
            for row in source:
                repeat_count = int(row[ConfigColumn.column])
                for _ in range(repeat_count):
                    writer.write(row)
def remove_quotes() -> None:
    """Strip one pair of surrounding double quotes from configured columns."""
    with TsvReader(filename=ConfigInputFile.input_file) as reader:
        with TsvWriter(filename=ConfigOutputFile.output_file) as writer:
            source = tqdm.tqdm(reader) if ConfigProgress.progress else reader
            for row in source:
                for index in ConfigColumns.columns:
                    value = row[index]
                    # the length guard keeps a lone '"' (which both starts
                    # and ends with a quote) from collapsing to nothing
                    if len(value) > 1 and value.startswith("\"") and value.endswith("\""):
                        row[index] = value[1:-1]
                writer.write(row)
def clean_by_field_num() -> None:
    """Keep only input rows whose field count equals the configured number.

    NOTE(review): here ConfigColumns.columns is compared against a length,
    i.e. used as an int, while other functions in this file iterate it as a
    list of indices — confirm the config's intended type.
    """
    with TsvReader(
            filename=ConfigInputFile.input_file,
            validate_all_lines_same_number_of_fields=False,
    ) as reader:
        with TsvWriter(filename=ConfigOutputFile.output_file) as writer:
            source = reader
            if ConfigProgress.progress:
                source = tqdm.tqdm(reader, desc=ConfigInputFile.input_file)
            for row in source:
                if len(row) == ConfigColumns.columns:
                    writer.write(row)
def process_single_file(job_info: JobInfo) -> JobReturnValue:
    """Split one input file into per-key output files.

    Rows are keyed by the comma-joined values of job_info.columns; each
    distinct key gets its own output file named from job_info.pattern.
    Returns a JobReturnValue carrying the job serial and a mapping of
    key -> output filename.
    """
    logger = logging.getLogger(__name__)
    tsv_writers_dict = dict()
    results = dict()
    with TsvReader(
            filename=job_info.input_file,
            check_non_ascii=job_info.check_not_ascii) as input_file_handle:
        if job_info.progress:
            # BUG FIX: the message was missing the f-prefix, so the literal
            # text "{job_info.input_file}" was logged instead of the filename
            # (sibling split_by_columns already uses the f-string correctly)
            logger.info(f"working on [{job_info.input_file}]")
            input_file_handle = tqdm.tqdm(input_file_handle)
        for fields in input_file_handle:
            key = ",".join([fields[x] for x in job_info.columns])
            if key not in tsv_writers_dict:
                filename = job_info.pattern.format(key=key, i=job_info.serial)
                results[key] = filename
                tsv_writers_dict[key] = TsvWriter(filename=filename)
            tsv_writers_dict[key].write(fields)
    for v in tsv_writers_dict.values():
        v.close()
    return JobReturnValue(job_info.serial, results)
def split_by_columns() -> None:
    """Partition the rows of all input files into per-key output files.

    The key is the comma-joined values of the configured columns; each
    distinct key gets its own writer, named via the configured pattern.
    """
    pylogconf.core.setup()
    logger = logging.getLogger(__name__)
    assert len(ConfigColumns.columns) > 0, "must provide --columns"
    writers = dict()
    for input_file in ConfigInputFiles.input_files:
        with TsvReader(
                filename=input_file,
                check_non_ascii=ConfigTsvReader.check_non_ascii,
        ) as reader:
            if ConfigProgress.progress:
                logger.info(f"working on [{input_file}]")
                reader = tqdm.tqdm(reader)
            for row in reader:
                key = ",".join([row[col] for col in ConfigColumns.columns])
                if key not in writers:
                    writers[key] = TsvWriter(
                        filename=ConfigPattern.pattern.format(key=key))
                writers[key].write(row)
    # close every per-key writer
    for writer in writers.values():
        writer.close()
def csv_to_tsv() -> None:
    """Convert a (possibly compressed) CSV input file into TSV output."""
    if ConfigCsvToTsv.set_max:
        csv.field_size_limit(sys.maxsize)
    with pyanyzip.core.openzip(ConfigInputFile.input_file,
                               "rt") as in_handle:
        reader = csv.reader(in_handle)
        with TsvWriter(
                filename=ConfigOutputFile.output_file,
                check_num_fields=ConfigCsvToTsv.check_num_fields,
        ) as writer:
            for record in reader:
                if ConfigCsvToTsv.replace_tabs_with_spaces:
                    # tabs inside cells would corrupt the TSV framing
                    record = [cell.replace("\t", " ") for cell in record]
                writer.write(record)
def drop_duplicates_by_columns() -> None:
    """Write each row whose projection onto the configured columns is new.

    BUG FIX: duplicates were detected with a frozenset of the column
    values, which conflates rows with permuted values (("a","b") vs
    ("b","a")) and collapses repeated values (("a","a") keys the same as
    ("a",)). A tuple keys on the actual per-column values in order.
    """
    with TsvReader(filename=ConfigInputFile.input_file) as input_file_handle:
        if ConfigProgress.progress:
            input_file_handle = tqdm.tqdm(input_file_handle)
        saw = set()
        with TsvWriter(
                filename=ConfigOutputFile.output_file) as output_file_handle:
            for fields in input_file_handle:
                match = tuple(
                    fields[match_column]
                    for match_column in ConfigColumns.columns
                )
                if match not in saw:
                    saw.add(match)
                    output_file_handle.write(fields)
def fix_columns() -> None:
    """Apply the configured cleanups to the configured columns of each row."""
    # read WITHOUT assuming the input file is free of problems
    with TsvReader(
        filename=ConfigInputFile.input_file,
        check_non_ascii=ConfigTsvReader.check_non_ascii,
    ) as reader:
        if ConfigProgress.progress:
            reader = tqdm.tqdm(reader)
        with TsvWriter(filename=ConfigOutputFile.output_file) as writer:
            for row in reader:
                for column in ConfigColumns.columns:
                    row[column] = clean(
                        text=row[column],
                        clean_edges=ConfigFixTypes.clean_edges,
                        sub_trailing=ConfigFixTypes.sub_trailing,
                        remove_non_ascii=ConfigFixTypes.remove_non_ascii,
                        lower_case=ConfigFixTypes.lower_case,
                    )
                writer.write(row)
def histogram_by_column() -> None:
    """Write a histogram of a numeric column.

    Output rows are: bucket lower edge, bucket upper edge, count in the
    bucket, and the cumulative percentage of rows seen so far.
    """
    a = []
    total = 0
    with TsvReader(ConfigInputFile.input_file) as input_handle:
        for fields in input_handle:
            a.append(float(fields[ConfigColumn.column]))
            total += 1
    count_in_bucket, bucket_edges = numpy.histogram(
        a, bins=ConfigBucketNumber.bucket_number)
    with TsvWriter(ConfigOutputFile.output_file) as output_handle:
        current_sum = 0
        for i, count in enumerate(count_in_bucket):
            current_sum += count
            edge_from = bucket_edges[i]
            edge_to = bucket_edges[i + 1]
            # ROBUSTNESS FIX: an empty input file means total == 0; the
            # original raised ZeroDivisionError computing the percentage
            percent = int(100.0 * current_sum / total) if total else 0
            output_handle.write([
                str(edge_from),
                str(edge_to),
                str(count),
                str(percent),
            ])