Example #1
0
def join() -> None:
    """Join input rows against a hash file on a key column.

    First pass loads the hash file into a key -> value mapping; second pass
    streams the input, inserting the joined value into each row (or the
    literal "unknown" / dropping the row, per configuration), and finally
    prints counters for the three outcomes.
    """
    lookup = dict()
    event_found = 0
    event_unknown_added = 0
    event_discarded = 0
    # pass 1: build the key -> value mapping
    with TsvReader(ConfigJoin.hash_file) as hash_file_handle:
        if ConfigProgress.progress:
            hash_file_handle = tqdm.tqdm(hash_file_handle, desc="reading hash")
        for fields in hash_file_handle:
            lookup[fields[ConfigJoin.hash_key_column]] = \
                fields[ConfigJoin.hash_value_column]
    # pass 2: stream the input and join
    with TsvReader(ConfigInputFile.input_file) as input_file_handle, \
            TsvWriter(ConfigOutputFile.output_file) as output_file_handle:
        if ConfigProgress.progress:
            input_file_handle = tqdm.tqdm(
                input_file_handle, desc="reading input and writing output")
        for fields in input_file_handle:
            key = fields[ConfigJoin.input_key_column]
            if key in lookup:
                event_found += 1
                fields.insert(ConfigJoin.output_insert_column, lookup[key])
                output_file_handle.write(fields)
            elif ConfigJoin.output_add_unknown:
                event_unknown_added += 1
                fields.insert(ConfigJoin.output_insert_column, "unknown")
                output_file_handle.write(fields)
            else:
                event_discarded += 1
    print("event_found {}".format(event_found))
    print("event_unknown_added {}".format(event_unknown_added))
    print("event_discarded {}".format(event_discarded))
Example #2
0
def majority() -> None:
    """
    For every value in the first column, keep only the second-column value
    it co-occurs with most (weighted by the multiplication column).

    This means that if x1 appears more
    with y2 than any other values in column Y then x1, y2 will be in the output
    and no other entry with x1 will appear
    """
    # BUG FIX: the annotation was Dict[Dict[str, int]], which is malformed —
    # Dict takes (key, value) type parameters. The actual shape is
    # first-column value -> {second-column value -> accumulated count}.
    d: Dict[str, Dict[str, int]] = defaultdict(dict)
    with TsvReader(filename=ConfigInputFile.input_file) as input_file_handle:
        if ConfigProgress.progress:
            input_file_handle = tqdm.tqdm(input_file_handle)
        for fields in input_file_handle:
            p_first = fields[ConfigMajority.input_first_column]
            p_second = fields[ConfigMajority.input_second_column]
            p_multiplication = int(
                fields[ConfigMajority.input_multiplication_column])
            if p_second not in d[p_first]:
                d[p_first][p_second] = 0
            d[p_first][p_second] += p_multiplication
    with TsvWriter(
            filename=ConfigOutputFile.output_file) as output_file_handle:
        for p_first, p_dict in d.items():
            # idiom: max over keys by their counts — replaces the
            # lambda-with-closure-default workaround; same tie-breaking
            # (first key in iteration order wins).
            p_second = max(p_dict, key=p_dict.get)
            p_count = p_dict[p_second]
            output_file_handle.write([
                p_first,
                p_second,
                str(p_count),
            ])
Example #3
0
def sample_by_column_old() -> None:
    """Weighted random sampling of input rows by a numeric column.

    Reads all rows, using the configured column as a non-negative weight,
    then draws ConfigSampleSize.size samples via numpy. In hits_mode each
    sampled row is written once together with its hit count; otherwise each
    sampled row is written as-is, once per draw.
    """
    weights = []
    elements = []
    sum_weights = float(0)
    with TsvReader(ConfigInputFile.input_file) as input_handle:
        if ConfigProgress.progress:
            input_handle = tqdm.tqdm(input_handle)
        for fields in input_handle:
            elements.append(fields)
            weight = float(fields[ConfigSampleColumn.sample_column])
            sum_weights += weight
            weights.append(weight)
    # the following code will only work on python3.6 because the
    # random.choices API was only introduced then
    # from random import choices
    # results = choices(lines, weights, k=size)

    # this is the same code with numpy
    # NOTE(review): a zero total weight raises ZeroDivisionError here — an
    # empty input file or all-zero weights is not handled upstream; confirm
    # whether a friendlier error is wanted.
    weights = [w / sum_weights for w in weights]
    if ConfigSampleByColumnOld.hits_mode:
        results_dict = defaultdict(int)
        # NOTE(review): drawing size=1 repeatedly makes the replace flag a
        # no-op — every draw is independent, i.e. always with replacement.
        # Confirm this is intended when ConfigReplace.replace is False.
        for _ in range(ConfigSampleSize.size):
            current_result = numpy.random.choice(
                a=len(elements),
                replace=ConfigReplace.replace,
                size=1,
                p=weights,
            )
            current_result = current_result[0]
            results_dict[current_result] += 1
        with TsvWriter(ConfigOutputFile.output_file) as output_handle:
            for result, hits in results_dict.items():
                record = list(elements[result])
                # BUG FIX: stringify the hit count — every other writer in
                # this file writes string fields, not ints.
                record.append(str(hits))
                output_handle.write(record)
    else:
        results = numpy.random.choice(
            a=len(elements),
            replace=ConfigReplace.replace,
            size=ConfigSampleSize.size,
            p=weights,
        )
        with TsvWriter(ConfigOutputFile.output_file) as output_handle:
            for result in results:
                output_handle.write(elements[result])
Example #4
0
def cut() -> None:
    """Project each input row onto the configured columns and write it out."""
    with TsvReader(filename=ConfigInputFile.input_file) as reader:
        with TsvWriter(
                filename=ConfigOutputFile.output_file) as writer:
            if ConfigProgress.progress:
                reader = tqdm.tqdm(reader)
            for row in reader:
                writer.write([row[column] for column in ConfigColumns.columns])
Example #5
0
def lc() -> None:
    """Lower-case the configured columns of every input row."""
    with TsvReader(filename=ConfigInputFile.input_file) as reader:
        with TsvWriter(
                filename=ConfigOutputFile.output_file) as writer:
            if ConfigProgress.progress:
                reader = tqdm.tqdm(reader)
            for row in reader:
                for column in ConfigColumns.columns:
                    row[column] = row[column].lower()
                writer.write(row)
Example #6
0
def multiply() -> None:
    """Write each row N times, where N is read from the configured column."""
    with TsvReader(filename=ConfigInputFile.input_file) as reader:
        with TsvWriter(
                filename=ConfigOutputFile.output_file) as writer:
            if ConfigProgress.progress:
                reader = tqdm.tqdm(reader)
            for row in reader:
                repeat = int(row[ConfigColumn.column])
                for _ in range(repeat):
                    writer.write(row)
Example #7
0
def remove_quotes() -> None:
    """Strip a surrounding pair of double quotes from the configured columns."""
    with TsvReader(filename=ConfigInputFile.input_file) as reader:
        with TsvWriter(
                filename=ConfigOutputFile.output_file) as writer:
            if ConfigProgress.progress:
                reader = tqdm.tqdm(reader)
            for row in reader:
                for column in ConfigColumns.columns:
                    value = row[column]
                    # len > 1 guards against a lone quote character, which
                    # both starts and ends with a quote but is not a pair
                    if len(value) > 1 and value.startswith('"') \
                            and value.endswith('"'):
                        row[column] = value[1:-1]
                writer.write(row)
Example #8
0
def clean_by_field_num() -> None:
    """Copy only the rows that have exactly the configured number of fields."""
    with TsvReader(filename=ConfigInputFile.input_file,
                   validate_all_lines_same_number_of_fields=False
                   ) as reader:
        with TsvWriter(
                filename=ConfigOutputFile.output_file) as writer:
            if ConfigProgress.progress:
                reader = tqdm.tqdm(reader, desc=ConfigInputFile.input_file)
            for row in reader:
                if len(row) == ConfigColumns.columns:
                    writer.write(row)
Example #9
0
def process_single_file(job_info: JobInfo) -> JobReturnValue:
    """Split one input file into per-key output files.

    The key is the comma-joined content of job_info.columns; each distinct
    key gets its own TsvWriter whose name comes from job_info.pattern.

    :param job_info: describes the input file, key columns, naming pattern
        and serial number of this job.
    :return: a JobReturnValue mapping each key to the filename written.
    """
    logger = logging.getLogger(__name__)
    tsv_writers_dict = dict()
    results = dict()
    try:
        with TsvReader(
                filename=job_info.input_file,
                check_non_ascii=job_info.check_not_ascii) as input_file_handle:
            if job_info.progress:
                # BUG FIX: the message was a plain string; it needs the
                # f-prefix to actually interpolate the filename (the sibling
                # split_by_columns already logs with an f-string)
                logger.info(f"working on [{job_info.input_file}]")
                input_file_handle = tqdm.tqdm(input_file_handle)
            for fields in input_file_handle:
                key = ",".join([fields[x] for x in job_info.columns])
                if key not in tsv_writers_dict:
                    filename = job_info.pattern.format(key=key, i=job_info.serial)
                    results[key] = filename
                    tsv_writers_dict[key] = TsvWriter(filename=filename)
                tsv_writers_dict[key].write(fields)
    finally:
        # ROBUSTNESS: close every per-key writer even if reading raised
        # part-way through, so no output file is left open
        for v in tsv_writers_dict.values():
            v.close()
    return JobReturnValue(job_info.serial, results)
Example #10
0
def split_by_columns() -> None:
    """Split every input file into one output file per distinct key.

    The key is the comma-joined content of the configured columns; each
    distinct key gets its own TsvWriter named via the configured pattern.
    """
    pylogconf.core.setup()
    logger = logging.getLogger(__name__)
    assert len(ConfigColumns.columns) > 0, "must provide --columns"
    writers = dict()
    for input_file in ConfigInputFiles.input_files:
        with TsvReader(filename=input_file,
                       check_non_ascii=ConfigTsvReader.check_non_ascii
                       ) as handle:
            if ConfigProgress.progress:
                logger.info(f"working on [{input_file}]")
                handle = tqdm.tqdm(handle)
            for row in handle:
                key = ",".join([row[x] for x in ConfigColumns.columns])
                writer = writers.get(key)
                if writer is None:
                    writer = TsvWriter(
                        filename=ConfigPattern.pattern.format(key=key))
                    writers[key] = writer
                writer.write(row)
    # close all writers
    for writer in writers.values():
        writer.close()
Example #11
0
def csv_to_tsv() -> None:
    """Convert a (possibly compressed) CSV input file to TSV output."""
    if ConfigCsvToTsv.set_max:
        # allow arbitrarily large CSV fields
        csv.field_size_limit(sys.maxsize)
    with pyanyzip.core.openzip(ConfigInputFile.input_file,
                               "rt") as in_handle:
        reader = csv.reader(in_handle)
        with TsvWriter(
                filename=ConfigOutputFile.output_file,
                check_num_fields=ConfigCsvToTsv.check_num_fields,
        ) as out_handle:
            for row in reader:
                if ConfigCsvToTsv.replace_tabs_with_spaces:
                    # tabs inside a cell would corrupt the TSV structure
                    row = [cell.replace("\t", " ") for cell in row]
                out_handle.write(row)
Example #12
0
def drop_duplicates_by_columns() -> None:
    """Emit only the first row seen for each combination of configured columns."""
    with TsvReader(filename=ConfigInputFile.input_file) as reader:
        if ConfigProgress.progress:
            reader = tqdm.tqdm(reader)
        seen = set()
        with TsvWriter(
                filename=ConfigOutputFile.output_file) as writer:
            for row in reader:
                # frozenset so the combination is hashable
                signature = frozenset(
                    row[match_column]
                    for match_column in ConfigColumns.columns
                )
                if signature in seen:
                    continue
                seen.add(signature)
                writer.write(row)
Example #13
0
def fix_columns() -> None:
    """Run the configured clean-ups over the configured columns of every row."""
    # read the input WITHOUT assuming it is free of problems
    with TsvReader(
            filename=ConfigInputFile.input_file,
            check_non_ascii=ConfigTsvReader.check_non_ascii,
    ) as reader:
        if ConfigProgress.progress:
            reader = tqdm.tqdm(reader)
        with TsvWriter(
                filename=ConfigOutputFile.output_file) as writer:
            for row in reader:
                for column in ConfigColumns.columns:
                    row[column] = clean(
                        text=row[column],
                        clean_edges=ConfigFixTypes.clean_edges,
                        sub_trailing=ConfigFixTypes.sub_trailing,
                        remove_non_ascii=ConfigFixTypes.remove_non_ascii,
                        lower_case=ConfigFixTypes.lower_case,
                    )
                writer.write(row)
Example #14
0
def histogram_by_column() -> None:
    """Bucket the values of the configured column and write a histogram.

    Each output row is: bucket start edge, bucket end edge, count in the
    bucket, and the cumulative percentage of values seen so far.
    """
    values = []
    total = 0
    with TsvReader(ConfigInputFile.input_file) as input_handle:
        for fields in input_handle:
            values.append(float(fields[ConfigColumn.column]))
            total += 1
    count_in_bucket, bucket_edges = numpy.histogram(
        values, bins=ConfigBucketNumber.bucket_number)
    with TsvWriter(ConfigOutputFile.output_file) as output_handle:
        current_sum = 0
        for i, count in enumerate(count_in_bucket):
            current_sum += count
            edge_from = bucket_edges[i]
            edge_to = bucket_edges[i + 1]
            # ROBUSTNESS: avoid ZeroDivisionError on an empty input file
            percent = int(100.0 * current_sum / total) if total else 0
            output_handle.write([
                str(edge_from),
                str(edge_to),
                str(count),
                str(percent),
            ])