def join() -> None:
    """Join input rows against a key->value mapping loaded from a hash file.

    Rows whose key is found get the value inserted at the configured column;
    unknown keys are either tagged "unknown" or dropped, per configuration.
    Counters for each outcome are printed at the end.
    """
    lookup = {}
    found_count = 0
    unknown_count = 0
    discarded_count = 0
    # first pass: load the key -> value mapping from the hash file
    with TsvReader(ConfigJoin.hash_file) as hash_handle:
        if ConfigProgress.progress:
            hash_handle = tqdm.tqdm(hash_handle, desc="reading hash")
        for row in hash_handle:
            lookup[row[ConfigJoin.hash_key_column]] = row[ConfigJoin.hash_value_column]
    # second pass: stream the input, joining each row against the mapping
    with TsvReader(ConfigInputFile.input_file) as in_handle, \
            TsvWriter(ConfigOutputFile.output_file) as out_handle:
        if ConfigProgress.progress:
            in_handle = tqdm.tqdm(
                in_handle, desc="reading input and writing output")
        for row in in_handle:
            key = row[ConfigJoin.input_key_column]
            if key in lookup:
                found_count += 1
                row.insert(ConfigJoin.output_insert_column, lookup[key])
                out_handle.write(row)
            elif ConfigJoin.output_add_unknown:
                unknown_count += 1
                row.insert(ConfigJoin.output_insert_column, "unknown")
                out_handle.write(row)
            else:
                discarded_count += 1
    print("event_found {}".format(found_count))
    print("event_unknown_added {}".format(unknown_count))
    print("event_discarded {}".format(discarded_count))
def majority() -> None:
    """Keep, for each first-column value, its most frequent second-column value.

    This means that if x1 appears more with y2 than any other values in
    column Y then x1, y2 will be in the output and no other entry with x1
    will appear. Counts are weighted by the multiplication column.
    """
    # fixed annotation: was Dict[Dict[str, int]], which is not a valid
    # parameterization of Dict (it takes a key type and a value type)
    d: Dict[str, Dict[str, int]] = defaultdict(dict)
    with TsvReader(filename=ConfigInputFile.input_file) as input_file_handle:
        if ConfigProgress.progress:
            input_file_handle = tqdm.tqdm(input_file_handle)
        for fields in input_file_handle:
            p_first = fields[ConfigMajority.input_first_column]
            p_second = fields[ConfigMajority.input_second_column]
            p_multiplication = int(
                fields[ConfigMajority.input_multiplication_column])
            if p_second not in d[p_first]:
                d[p_first][p_second] = 0
            d[p_first][p_second] += p_multiplication
    with TsvWriter(
            filename=ConfigOutputFile.output_file) as output_file_handle:
        for p_first, p_dict in d.items():
            # the second-column value with the largest accumulated weight;
            # dict.get replaces the old closure-default lambda
            p_second = max(p_dict, key=p_dict.get)
            p_count = p_dict[p_second]
            output_file_handle.write([
                p_first,
                p_second,
                str(p_count),
            ])
def check() -> None:
    """Validate the configured input files as TSV, serially or in parallel.

    TODO:
    - add ability to say how many lines are bad and print their content
    """
    if ConfigParallel.parallel:
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=ConfigParallel.jobs) as executor:
            job_list = []
            for input_file in ConfigInputFiles.input_files:
                job = ParamsForJob()
                job.progress = ConfigProgress.progress
                job.check_non_ascii = ConfigTsvReader.check_non_ascii
                job.num_fields = ConfigNumFields.num_fields
                job.input_file = input_file
                job.validate_all_lines_same_number_of_fields = ConfigTsvReader.validate_all_lines_same_number_of_fields
                job_list.append(job)
            results = list(executor.map(check_file, job_list))
            print(results)
        # bug fix: previously control fell through and every file was
        # re-checked serially after the parallel run had already finished
        return
    for input_file in ConfigInputFiles.input_files:
        with TsvReader(
                filename=input_file,
                num_fields=ConfigNumFields.num_fields,
                validate_all_lines_same_number_of_fields=ConfigTsvReader.
                validate_all_lines_same_number_of_fields,
                check_non_ascii=ConfigTsvReader.check_non_ascii,
        ) as input_file_handle:
            if ConfigProgress.progress:
                input_file_handle = tqdm.tqdm(input_file_handle,
                                              desc=input_file)
            # reading every line is the check; TsvReader raises on bad input
            for _ in input_file_handle:
                pass
def read() -> None:
    """Read every configured input file end to end, discarding the rows."""
    for current_file in ConfigInputFiles.input_files:
        with TsvReader(filename=current_file) as reader:
            if ConfigProgress.progress:
                reader = tqdm.tqdm(reader, desc=current_file)
            for _row in reader:
                continue
def sum_columns() -> None:
    """Sum the configured columns over all input rows and print the totals."""
    totals = [0] * len(ConfigColumns.columns)
    with TsvReader(filename=ConfigInputFile.input_file) as reader:
        if ConfigProgress.progress:
            reader = tqdm.tqdm(reader)
        for row in reader:
            for position, column in enumerate(ConfigColumns.columns):
                totals[position] += float(row[column])
    print(totals)
def cut() -> None:
    """Project every input row onto the configured columns (like unix cut)."""
    with TsvReader(filename=ConfigInputFile.input_file) as reader, \
            TsvWriter(filename=ConfigOutputFile.output_file) as writer:
        if ConfigProgress.progress:
            reader = tqdm.tqdm(reader)
        for row in reader:
            writer.write([row[column] for column in ConfigColumns.columns])
def multiply() -> None:
    """Emit each row N times, where N is the integer in the configured column."""
    with TsvReader(filename=ConfigInputFile.input_file) as reader, \
            TsvWriter(filename=ConfigOutputFile.output_file) as writer:
        if ConfigProgress.progress:
            reader = tqdm.tqdm(reader)
        for row in reader:
            repeat_count = int(row[ConfigColumn.column])
            for _ in range(repeat_count):
                writer.write(row)
def lc() -> None:
    """Lower-case the configured columns of every input row."""
    with TsvReader(filename=ConfigInputFile.input_file) as reader, \
            TsvWriter(filename=ConfigOutputFile.output_file) as writer:
        if ConfigProgress.progress:
            reader = tqdm.tqdm(reader)
        for row in reader:
            for column in ConfigColumns.columns:
                row[column] = row[column].lower()
            writer.write(row)
def remove_quotes() -> None:
    """Strip a surrounding pair of double quotes from the configured columns."""
    with TsvReader(filename=ConfigInputFile.input_file) as reader, \
            TsvWriter(filename=ConfigOutputFile.output_file) as writer:
        if ConfigProgress.progress:
            reader = tqdm.tqdm(reader)
        for row in reader:
            for column in ConfigColumns.columns:
                value = row[column]
                # the length check leaves a lone quote character untouched
                if len(value) > 1 and value.startswith("\"") and value.endswith("\""):
                    row[column] = value[1:-1]
            writer.write(row)
def clean_by_field_num() -> None:
    """Copy only the rows whose field count matches the configured value."""
    with TsvReader(filename=ConfigInputFile.input_file,
                   validate_all_lines_same_number_of_fields=False
                   ) as reader:
        with TsvWriter(
                filename=ConfigOutputFile.output_file) as writer:
            if ConfigProgress.progress:
                reader = tqdm.tqdm(reader, desc=ConfigInputFile.input_file)
            for row in reader:
                # NOTE(review): this compares an int to ConfigColumns.columns,
                # which the other commands iterate as a list of column indices;
                # verify whether a dedicated field-count config was intended
                if len(row) == ConfigColumns.columns:
                    writer.write(row)
def check_file(params_for_job: ParamsForJob) -> bool:
    """Fully read one file to validate it; return True when it reads cleanly."""
    print('checking [{}]...'.format(params_for_job.input_file))
    reader_kwargs = dict(
        filename=params_for_job.input_file,
        num_fields=params_for_job.num_fields,
        validate_all_lines_same_number_of_fields=params_for_job.validate_all_lines_same_number_of_fields,
        check_non_ascii=params_for_job.check_non_ascii,
    )
    with TsvReader(**reader_kwargs) as reader:
        if params_for_job.progress:
            reader = tqdm.tqdm(reader)
        # consuming every line is the validation itself
        for _row in reader:
            continue
    return True
def drop_duplicates_by_columns() -> None:
    """Keep only the first row for each distinct combination of the configured columns."""
    seen = set()
    with TsvReader(filename=ConfigInputFile.input_file) as reader:
        if ConfigProgress.progress:
            reader = tqdm.tqdm(reader)
        with TsvWriter(
                filename=ConfigOutputFile.output_file) as writer:
            for row in reader:
                # NOTE(review): frozenset ignores column order and collapses
                # equal values across columns — confirm a tuple was not intended
                signature = frozenset(
                    row[column] for column in ConfigColumns.columns)
                if signature in seen:
                    continue
                seen.add(signature)
                writer.write(row)
def tree() -> None:
    """Draw the parent/child relation from the input file as an ASCII tree.

    Roots are taken from ConfigTree.roots when given, otherwise computed as
    the parents that have no parents of their own.

    You can also see only parts of the tree
    """
    # parent -> set of its children / child -> set of its parents
    children_dict: Dict[str, Set[str]] = defaultdict(set)
    parents_dict = defaultdict(set)
    with TsvReader(filename=ConfigInputFile.input_file) as input_file_handle:
        for fields in input_file_handle:
            p_parent = fields[ConfigTree.parent_column]
            p_child = fields[ConfigTree.child_column]
            children_dict[p_parent].add(p_child)
            parents_dict[p_child].add(p_parent)
    # find the roots (parents that have no parents)
    if ConfigTree.roots:
        list_of_roots = ConfigTree.roots
    else:
        list_of_roots = []
        for p_parent in children_dict.keys():
            if len(parents_dict[p_parent]) == 0:
                list_of_roots.append(p_parent)
    # stack entries are (name, depth, flag, prefix-to-print)
    # NOTE(review): the flag pushed here is True for the FIRST sibling but is
    # unpacked below as "last" and selects the └── corner glyph — confirm the
    # corner is really meant to mark the first sibling rather than the last
    list_to_append = []
    first = True
    for root in list_of_roots:
        list_to_append.append((root, 0, first, ""))
        first = False
    stack = []
    stack.extend(list_to_append)
    # let's draw the tree (iterative depth-first traversal via the stack)
    while len(stack) > 0:
        name, depth, last, print_list = stack.pop()
        if last:
            special_string = u"└──"
        else:
            special_string = u"├──"
        print("{}{}".format(print_list + special_string, name))
        first = True
        list_to_append = []
        for p_child in children_dict[name]:
            # continuation prefix for this node's subtree: blank under a
            # corner, a vertical bar under a tee
            if last:
                special_string = "    "
            else:
                special_string = u"│   "
            list_to_append.append(
                (p_child, depth + 1, first, print_list + special_string))
            first = False
        # reversed so that pop() visits the children in insertion order
        stack.extend(list(reversed(list_to_append)))
def sample_by_column_old() -> None:
    """Sample rows from the input with probability proportional to a weight column.

    In hits mode each sampled row is written once with its hit count appended;
    otherwise every sampled row is written as-is.
    """
    weights = []
    elements = []
    sum_weights = float(0)
    # read all rows, remembering each row and its weight
    with TsvReader(ConfigInputFile.input_file) as input_handle:
        if ConfigProgress.progress:
            input_handle = tqdm.tqdm(input_handle)
        for fields in input_handle:
            elements.append(fields)
            weight = float(fields[ConfigSampleColumn.sample_column])
            sum_weights += weight
            weights.append(weight)
    # the following code will only work on python3.6 because the
    # random.choices API was only introduced then
    # from random import choices
    # results = choices(lines, weights, k=size)
    # this is the same code with numpy
    # normalize weights to probabilities (numpy requires they sum to 1)
    weights = [w / sum_weights for w in weights]
    if ConfigSampleByColumnOld.hits_mode:
        results_dict = defaultdict(int)
        # NOTE(review): drawing size=1 in a loop makes each draw independent,
        # so replace=False has no effect here — confirm this is intended
        for _ in range(ConfigSampleSize.size):
            current_result = numpy.random.choice(
                a=len(elements),
                replace=ConfigReplace.replace,
                size=1,
                p=weights,
            )
            current_result = current_result[0]
            results_dict[current_result] += 1
        with TsvWriter(ConfigOutputFile.output_file) as output_handle:
            for result, hits in results_dict.items():
                record = list(elements[result])
                # NOTE(review): hits is an int while other writers pass
                # strings — verify TsvWriter accepts non-string fields
                record.append(hits)
                output_handle.write(record)
    else:
        # single vectorized draw of all samples
        results = numpy.random.choice(
            a=len(elements),
            replace=ConfigReplace.replace,
            size=ConfigSampleSize.size,
            p=weights,
        )
        with TsvWriter(ConfigOutputFile.output_file) as output_handle:
            for result in results:
                output_handle.write(elements[result])
def fix_columns() -> None:
    """Apply the configured text cleanups to the chosen columns of every row."""
    # We need to read the input file WITHOUT assuming that it hasn't problems
    with TsvReader(
            filename=ConfigInputFile.input_file,
            check_non_ascii=ConfigTsvReader.check_non_ascii,
    ) as reader:
        if ConfigProgress.progress:
            reader = tqdm.tqdm(reader)
        with TsvWriter(
                filename=ConfigOutputFile.output_file) as writer:
            for row in reader:
                for column in ConfigColumns.columns:
                    row[column] = clean(
                        text=row[column],
                        clean_edges=ConfigFixTypes.clean_edges,
                        sub_trailing=ConfigFixTypes.sub_trailing,
                        remove_non_ascii=ConfigFixTypes.remove_non_ascii,
                        lower_case=ConfigFixTypes.lower_case,
                    )
                writer.write(row)
def histogram_by_column() -> None:
    """Bucket one numeric column into a histogram and write it with a cumulative percentage."""
    values = []
    total = 0
    with TsvReader(ConfigInputFile.input_file) as reader:
        for row in reader:
            values.append(float(row[ConfigColumn.column]))
            total += 1
    counts, edges = numpy.histogram(
        values, bins=ConfigBucketNumber.bucket_number)
    with TsvWriter(ConfigOutputFile.output_file) as writer:
        running_total = 0
        for index, bucket_count in enumerate(counts):
            running_total += bucket_count
            # edge_from, edge_to, count, cumulative percent so far
            writer.write([
                str(edges[index]),
                str(edges[index + 1]),
                str(bucket_count),
                str(int(100.0 * running_total / total)),
            ])
def process_single_file(job_info: JobInfo) -> JobReturnValue:
    """Split one input file into per-key output files.

    The key is the comma-joined values of job_info.columns; each distinct key
    gets its own TsvWriter whose filename comes from job_info.pattern.
    Returns a JobReturnValue mapping each key to the filename it was written to.
    """
    logger = logging.getLogger(__name__)
    tsv_writers_dict = dict()
    results = dict()
    with TsvReader(
            filename=job_info.input_file,
            check_non_ascii=job_info.check_not_ascii) as input_file_handle:
        if job_info.progress:
            # bug fix: the message was missing the f-prefix, so the literal
            # template text was logged instead of the actual file name
            logger.info(f"working on [{job_info.input_file}]")
            input_file_handle = tqdm.tqdm(input_file_handle)
        for fields in input_file_handle:
            key = ",".join([fields[x] for x in job_info.columns])
            if key not in tsv_writers_dict:
                filename = job_info.pattern.format(key=key, i=job_info.serial)
                results[key] = filename
                output_handle = TsvWriter(filename=filename)
                tsv_writers_dict[key] = output_handle
            output_handle = tsv_writers_dict[key]
            output_handle.write(fields)
    # close all per-key writers before reporting
    for v in tsv_writers_dict.values():
        v.close()
    return JobReturnValue(job_info.serial, results)
def split_by_columns() -> None:
    """Split the input files into one output file per distinct key of the chosen columns."""
    pylogconf.core.setup()
    logger = logging.getLogger(__name__)
    assert len(ConfigColumns.columns) > 0, "must provide --columns"
    writers = dict()
    for current_file in ConfigInputFiles.input_files:
        with TsvReader(filename=current_file,
                       check_non_ascii=ConfigTsvReader.check_non_ascii
                       ) as reader:
            if ConfigProgress.progress:
                logger.info(f"working on [{current_file}]")
                reader = tqdm.tqdm(reader)
            for row in reader:
                key = ",".join([row[column] for column in ConfigColumns.columns])
                if key not in writers:
                    writers[key] = TsvWriter(
                        filename=ConfigPattern.pattern.format(key=key))
                writers[key].write(row)
    # close all writers
    for writer in writers.values():
        writer.close()
def check_columns_unique() -> None:
    """Assert that every configured column holds unique values across all input files."""
    # one value -> first-seen-line map per configured column
    seen_per_column = [dict() for _ in ConfigColumns.columns]
    errors = False
    for current_file in ConfigInputFiles.input_files:
        with TsvReader(filename=current_file, ) as reader:
            if ConfigProgress.progress:
                reader = tqdm.tqdm(reader, desc=current_file)
            for line_number, row in enumerate(reader):
                for position, column in enumerate(ConfigColumns.columns):
                    value = row[column]
                    previous_line = seen_per_column[position].get(value)
                    if previous_line is not None:
                        print(
                            "value [{}] is duplicate on lines [{}, {}]".format(
                                value,
                                previous_line,
                                line_number,
                            ))
                        errors = True
                    else:
                        seen_per_column[position][value] = line_number
    assert errors is False, "found errors"
def read_all_file(filename: str) -> None:
    """Read *filename* end to end, discarding every row."""
    with TsvReader(filename=filename) as reader:
        for _row in reader:
            continue
def testGoodFile(self):
    """A known-good file should open and close without raising."""
    reader = TsvReader(filename=file_good)
    reader.close()
def tsv_to_csv() -> None:
    """Convert the input TSV file into the output file in CSV format."""
    # bug fix: the csv module requires the file to be opened with newline=""
    # (otherwise rows are written with \r\r\n line endings on Windows)
    with open(ConfigOutputFile.output_file, "wt", newline="") as output_file_handle:
        csv_writer = csv.writer(output_file_handle)
        with TsvReader(ConfigInputFile.input_file) as input_file_handle:
            for fields in input_file_handle:
                csv_writer.writerow(fields)