Exemple #1
0
def find_sequences_removed_from_remote():
    """
    Print how many accession ids exist in the source database but not in the target one.

    The accession ids of all Sequence rows are read from ``source_db_name`` and
    from ``target_db_name``; the size of the set difference (source minus target)
    is printed. Nothing is returned.
    """

    def _fetch_accession_ids(db_name):
        # Close the session even when the query raises, so that a failure
        # does not leave a dangling session/connection behind.
        session = get_session(db_name)
        try:
            return set(session.query(Sequence.accession_id).all())
        finally:
            session.close()

    old_accession_ids = _fetch_accession_ids(source_db_name)
    new_accession_ids = _fetch_accession_ids(target_db_name)
    removed_ids = old_accession_ids - new_accession_ids
    # NOTE(review): the database names in this message are hard-coded and may
    # not match source_db_name / target_db_name - confirm before relying on it.
    print(
        f'Number of sequences present in vcm_gisaid_11 and missing in vcm_gisaid_12: {len(removed_ids)}'
    )
Exemple #2
0
def write_comparable_infos_to_csv(db_name, file_path):
    """
    Export accession id, strain name and length of the sequences of ``db_name``.

    One comma-separated line is written to ``file_path`` for every Sequence row
    whose strain_name is not null, ordered by accession_id. Any exception is
    printed and ends the export early; the session is closed in every case.
    Finally the number of written rows is printed next to the total count of
    Sequence rows.

    :param db_name: name of the database to open a session on.
    :param file_path: path of the output file (opened in write mode).
    """
    rows_written = 0
    total_rows = None
    session = get_session(db_name)
    try:
        total_rows = session.query(func.count(Sequence.sequence_id)).scalar()

        with open(file_path, mode='w') as csv_file:
            # yield_per(100) asks the ORM to fetch the rows in batches of 100
            named_sequences = (
                session.query(Sequence)
                .filter(Sequence.strain_name.isnot(None))
                .order_by(Sequence.accession_id)
                .yield_per(100)
            )
            for seq in tqdm(named_sequences):
                # the attributes of interest, in output column order
                fields = (seq.accession_id, seq.strain_name, str(seq.length))
                csv_file.write(','.join(fields) + '\n')
                rows_written += 1
    except Exception as error:
        print(error)
    finally:
        session.close()

    print(
        f'written {rows_written} / {total_rows}. Missing rows had no strain_name.'
    )
Exemple #3
0
def mark_overlaps():
    """
    Find the target sequences overlapping with each COG-UK source sequence.

    Attributes involved in the comparison:
    strain, length from COG-UK (table Sequence)
    strain, length from GISAID (table Sequence)

    For every source sequence, the target sequences with the same strain are
    split into those having also the same length and those matching on the
    strain only. Same-length matches take precedence. The first match is used
    for copying metadata into the source sequence and all matches of the chosen
    group are stored as overlaps. Changes are committed only if
    ``user_asked_to_commit`` is set; on interruption or error both sessions are
    rolled back. In every case a report with one line per match plus the totals
    is written under ./overlaps/.
    """
    global total_only_strain_1_to_n, total_strain_plus_length_1_to_n, output_record, total_only_strain_final, \
        total_strain_plus_length_final, total_only_strain_1_to_1, total_strain_plus_length_1_to_1

    source_session = get_session(source_db_name)
    target_session = get_session(target_db_name)
    # presumably removes the overlaps recorded by a previous run - verify in the helper
    cleanup_overlap_tables(source_session, target_session)

    try:
        n_source_sequences = source_sequences(
            session=source_session,
            database_source=source_database_source,
            for_overlaps_with_target_source=target_name,
            count_only=True)
        source_iterable = source_sequences(
            session=source_session,
            database_source=source_database_source,
            for_overlaps_with_target_source=target_name)
        # Disabled alternative: iterate over the ids returned by
        # read_coguk_overlapping_ids_from_file() and load each Sequence by
        # accession_id, skipping the ids that are not found.
        for source_seq in tqdm(iterable=source_iterable,
                               total=n_source_sequences):
            same_length = []
            different_length = []

            candidates = target_sequences(
                target_session, matching_strain=source_seq.strain_name)
            for candidate in candidates:
                group = same_length if candidate.length == source_seq.length else different_length
                group.append(candidate)

            if same_length:
                matched_ids = [t.accession_id for t in same_length]
                message = f'{source_seq.accession_id} matches with {target_name} strain+length on {matched_ids}.'
                if len(same_length) == 1:
                    total_strain_plus_length_1_to_1 += 1
                else:
                    message += f' Only {matched_ids[0]} is used for copying metadata'
                    total_strain_plus_length_1_to_n += 1
                output_record.append(message)
                total_strain_plus_length_final += 1
                put_gisaid_metadata_into_coguk(
                    source_session,
                    source_seq.accession_id,
                    target_session,
                    same_length[0].accession_id)
                insert_overlaps_in_db(
                    source_session, target_session, source_seq, same_length,
                    'COG-UK', target_name)
            elif different_length:
                matched_ids = [t.accession_id for t in different_length]
                message = f'{source_seq.accession_id} matches with {target_name} strain only on {matched_ids}.'
                if len(different_length) == 1:
                    total_only_strain_1_to_1 += 1
                else:
                    message += f' Only {matched_ids[0]} is used for copying metadata'
                    total_only_strain_1_to_n += 1
                output_record.append(message)
                total_only_strain_final += 1
                put_gisaid_metadata_into_coguk(
                    source_session,
                    source_seq.accession_id,
                    target_session,
                    different_length[0].accession_id)
                insert_overlaps_in_db(
                    source_session, target_session, source_seq,
                    different_length, 'COG-UK', target_name)

        if user_asked_to_commit:
            source_session.commit()
            target_session.commit()

        # set gisaid_only (runs after the overlaps have been committed, when committing is enabled)
        set_gisaid_only_based_on_overlap_table(target_session)
        if user_asked_to_commit:
            target_session.commit()

    except KeyboardInterrupt:
        rollback(source_session)
        rollback(target_session)
        output_record.append(
            "COMPUTATION INTERRUPTED. TOTALS MAY BE INCOMPLETE !!")
    except Exception:
        logger.exception("")
        logger.info("rollback of changes")
        rollback(source_session)
        rollback(target_session)
        output_record.append(
            "COMPUTATION INTERRUPTED. TOTALS MAY BE INCOMPLETE !!")
    finally:
        source_session.close()
        target_session.close()

        totals_string = (
            f'TOTALS:\n'
            f'1-1 PAIRS strain+length: {total_strain_plus_length_1_to_1} -- only strain {total_only_strain_1_to_1}\n'
            f'1-N PAIRS strain+length: {total_strain_plus_length_1_to_n} -- only strain {total_only_strain_1_to_n}\n'
            f'PAIRS actually used for updating {source_name} metadata:'
            f'strain+length: {total_strain_plus_length_final} -- only strain {total_only_strain_final}\n'
        )
        logger.info(totals_string)

        # the report is written even after a failure, so partial results are kept
        report_path = (
            f'.{sep}overlaps{sep}{source_name}_{target_name}{sep}'
            f'{source_name}@{source_db_name}_overlapping_{target_name}@{target_db_name}'
            f'_{date.today().strftime("%Y-%b-%d")}.txt'
        ).lower()
        with open(file=report_path, mode='w') as report:
            report.writelines(line + "\n" for line in output_record)
            report.write(totals_string)
Exemple #4
0
def mark_overlaps():
    """
    Find, within a single database, the target sequences overlapping each source sequence.

    Source sequences (SARS-CoV-2 only, from ``source_database_source``) are
    compared with the sequences of ``target_database_source`` having the same
    strain. Matches with the same length take precedence over matches on the
    strain only; all matches of the chosen group are stored as overlaps. A
    1-N match is preceded by a WARN marker line in the report. Changes are
    committed only if ``user_asked_to_commit`` is set; on interruption or error
    the session is rolled back. In every case a report with the matches and the
    totals is written under ./overlaps/.
    """
    global total_only_strain_1_to_n, total_strain_plus_length_1_to_n, output_record, total_only_strain_1_to_1, total_strain_plus_length_1_to_1

    session = get_session(db_name)
    # presumably removes the overlaps recorded by a previous run - verify in the helper
    cleanup_overlap_tables(session)

    try:
        n_source_sequences = source_sequences(
            session=session,
            database_source=source_database_source,
            virus_taxon_name='Severe acute respiratory syndrome coronavirus 2',
            for_overlaps_with_target_source=target_name,
            count_only=True)
        source_iterable = source_sequences(
            session=session,
            database_source=source_database_source,
            virus_taxon_name='Severe acute respiratory syndrome coronavirus 2',
            for_overlaps_with_target_source=target_name)
        for source_seq in tqdm(iterable=source_iterable,
                               total=n_source_sequences):
            same_length = []
            different_length = []

            candidates = target_sequences(
                session=session,
                matching_strain=source_seq.strain_name,
                database_source=target_database_source)
            for candidate in candidates:
                group = same_length if candidate.length == source_seq.length else different_length
                group.append(candidate)

            if same_length:
                if len(same_length) == 1:
                    total_strain_plus_length_1_to_1 += 1
                else:
                    # marker line that makes 1-N matches easy to find in the report
                    output_record.append('\t\tWARN\t\t')
                    total_strain_plus_length_1_to_n += 1
                matched_ids = [t.accession_id for t in same_length]
                output_record.append(
                    f'{source_seq.accession_id} matches with {target_name} strain+length on {matched_ids}'
                )
                # source and target live in the same database, hence the same session twice
                insert_overlaps_in_db(
                    session, session, source_seq, same_length, source_name,
                    target_name)
            elif different_length:
                if len(different_length) == 1:
                    total_only_strain_1_to_1 += 1
                else:
                    output_record.append('\t\tWARN\t\t')
                    total_only_strain_1_to_n += 1
                matched_ids = [t.accession_id for t in different_length]
                output_record.append(
                    f'{source_seq.accession_id} matches with {target_name} strain on {matched_ids}'
                )
                insert_overlaps_in_db(
                    session, session, source_seq, different_length,
                    source_name, target_name)

        if user_asked_to_commit:
            session.commit()
    except KeyboardInterrupt:
        rollback(session)
        output_record.append(
            "COMPUTATION INTERRUPTED. TOTALS MAY BE INCOMPLETE !!")
    except Exception:
        logger.exception("")
        rollback(session)
        output_record.append(
            "COMPUTATION INTERRUPTED. TOTALS MAY BE INCOMPLETE !!")
    finally:
        session.close()

        totals_string = (
            f'TOTALS:\n'
            f'1-1 MATCHES: strain+length: {total_strain_plus_length_1_to_1} -- only strain {total_only_strain_1_to_1}\n'
            f'1-N MATCHES: strain+length: {total_strain_plus_length_1_to_n} -- only strain {total_only_strain_1_to_n} (search "WARN" to find \'em)\n'
        )
        logger.info(totals_string)

        # the report is written even after a failure, so partial results are kept
        report_path = (
            f'.{sep}overlaps{sep}{source_name}_{target_name}{sep}'
            f'{source_name}@{db_name}_overlapping_{target_name}@{db_name}'
            f'_{date.today().strftime("%Y-%b-%d")}.txt'
        ).lower()
        with open(file=report_path, mode='w') as report:
            report.writelines(line + "\n" for line in output_record)
            report.write(totals_string)
Exemple #5
0
def _narrow_overlap_candidates(source_seq, source_host, candidates, criterion):
    """
    Reduce the target sequences matching ``source_seq`` to the most plausible ones.

    :param source_seq: the source Sequence being matched.
    :param source_host: the host sample of the source sequence (its country is used).
    :param candidates: non-empty collection of target sequences sharing the strain.
    :param criterion: label of the match kind ('strain+length' or 'strain_only'),
        used only to compose the report message.
    :return: tuple (remaining candidates, report message). The candidates may be
        empty, in which case the match has to be discarded.
    """
    # 1) with several candidates, keep those whose strain mentions the source country
    if len(candidates) > 1 and source_host.country is not None:
        country = source_host.country.lower()
        candidates = [
            c for c in candidates
            if country in c.strain_name.strip().lower()
        ]

    if len(candidates) > 1 and source_seq.strain_name.isnumeric():
        # 2) purely numerical source strains are ambiguous: apply origin heuristics
        from_switzerland = [
            c for c in candidates if "Switzerland" in c.strain_name
        ]
        from_england = [c for c in candidates if "England" in c.strain_name]
        if len(from_switzerland) == 1:
            candidates = from_switzerland
            ids = [c.accession_id for c in candidates]
            message = f'{source_seq.accession_id} matches with {target_name} {criterion}+only_numerical_strain+Switzerland_origin on {ids}.'
        elif len(from_england) == len(candidates):
            # if all targets are England sequences, it is a fake match
            candidates = []
            message = ''
        else:
            candidates = filter_exact_strain_number_match(
                source_seq.strain_name, candidates)
            ids = [c.accession_id for c in candidates]
            message = f'{source_seq.accession_id} matches with {target_name} {criterion} on {ids}.'
    else:
        # 3) otherwise keep the candidates with the most exact strain match
        candidates = filter_most_exact_strain_match(source_seq.strain_name,
                                                    candidates)
        ids = [c.accession_id for c in candidates]
        message = f'{source_seq.accession_id} matches with {target_name} {criterion} (possibly also country) on {ids}.'
    return candidates, message


def mark_overlaps():
    """
    Find the target sequences overlapping with each GenBank source sequence.

    These overlaps involve the attributes:
    strain, length, country from GenBank (tables Sequence and HostSample)
    strain, length from GISAID (table Sequence)

    For every source sequence, the target sequences with the same strain are
    split into those having also the same length and those matching on the
    strain only. Same-length matches take precedence and the strain-only ones
    are not reconsidered when the same-length group is later filtered down to
    nothing. The chosen group is narrowed by country/origin/strain heuristics
    and what remains is stored as overlaps. Changes are committed only if
    ``user_asked_to_commit`` is set; on interruption or error both sessions are
    rolled back. In every case a report with the matches and the totals is
    written under ./overlaps/.
    """
    global total_only_strain_1_to_n, total_strain_plus_length_1_to_n, output_record, total_only_strain_1_to_1, total_strain_plus_length_1_to_1, gisaid_only_false_tuples

    source_session = get_session(source_db_name)
    target_session = get_session(target_db_name)
    # presumably removes the overlaps recorded by a previous run - verify in the helper
    cleanup_overlap_tables(source_session, target_session)
    all_target_ids_changed = set()

    try:
        count_source_seq = source_sequences(
            session=source_session,
            database_source=source_database_source,
            virus_taxon_name='Severe acute respiratory syndrome coronavirus 2',
            count_only=True)
        source_iterable = source_sequences(
            session=source_session,
            database_source=source_database_source,
            virus_taxon_name='Severe acute respiratory syndrome coronavirus 2')
        for source_seq, source_host in tqdm(iterable=source_iterable,
                                            total=count_source_seq):
            only_strain = []
            strain_plus_length = []

            target_seq_query = target_sequences(
                target_session, matching_strain=source_seq.strain_name)
            for target_seq in target_seq_query:
                if target_seq.length == source_seq.length:
                    strain_plus_length.append(target_seq)
                else:
                    only_strain.append(target_seq)

            if strain_plus_length:
                matched, composed_string = _narrow_overlap_candidates(
                    source_seq, source_host, strain_plus_length,
                    'strain+length')
                if len(matched) > 1:
                    total_strain_plus_length_1_to_n += 1
                    composed_string += ' WARNING:  1-N MATCH'
                elif len(matched) == 1:
                    total_strain_plus_length_1_to_1 += 1
            elif only_strain:
                matched, composed_string = _narrow_overlap_candidates(
                    source_seq, source_host, only_strain, 'strain_only')
                if len(matched) > 1:
                    total_only_strain_1_to_n += 1
                    composed_string += ' WARNING:  1-N MATCH'
                elif len(matched) == 1:
                    total_only_strain_1_to_1 += 1
            else:
                continue

            if not matched:
                # every candidate has been discarded by the heuristics
                continue

            output_record.append(composed_string)
            output_record.append(
                f'\tstrain values: GENBANK: {source_seq.strain_name} GISAID: {[m.strain_name for m in matched]}'
            )

            # Report duplicated target sequences. The check and the reported
            # difference are both based on sequence_id (they used to rely on
            # two different attributes).
            unique_sequence_ids = {m.sequence_id for m in matched}
            if len(unique_sequence_ids) != len(matched):
                difference = len(matched) - len(unique_sequence_ids)
                logger.error(f'mismatch of {difference}')

            gisaid_only_false_tuples += len(matched)
            all_target_ids_changed.update(m.accession_id for m in matched)
            # NOTE: a reverse mapping (target accession id -> matching source
            # ids and strains) was sketched here in the past and is not collected.
            insert_overlaps_in_db(source_session, target_session, source_seq,
                                  matched, source_name, target_name)

        logger.debug(
            f'total number of target sequences changed {len(all_target_ids_changed)}'
        )
        if user_asked_to_commit:
            target_session.commit()
            source_session.commit()

        # set gisaid_only (runs after the overlaps have been committed, when committing is enabled)
        set_gisaid_only_based_on_overlap_table(target_session)
        if user_asked_to_commit:
            target_session.commit()
    except KeyboardInterrupt:
        rollback(source_session)
        rollback(target_session)
        output_record.append(
            "COMPUTATION INTERRUPTED. TOTALS MAY BE INCOMPLETE !!")
    except Exception:
        logger.exception("")
        rollback(source_session)
        rollback(target_session)
        output_record.append(
            "COMPUTATION INTERRUPTED. TOTALS MAY BE INCOMPLETE !!")
    finally:
        source_session.close()
        target_session.close()

        totals_string = f'TOTALS:\n' \
                        f'1-1 MATCHES: strain+length: {total_strain_plus_length_1_to_1} -- only strain {total_only_strain_1_to_1}\n' \
                        f'1-N MATCHES: strain+length: {total_strain_plus_length_1_to_n} -- only strain {total_only_strain_1_to_n} (search "WARN" to find \'em)\n' \
                        f'{target_name} tuples in which to set SEQUENCE.gisaid_only to False {gisaid_only_false_tuples}\n'

        logger.info(totals_string)

        # the report is written even after a failure, so partial results are kept
        output_path = f'.{sep}overlaps{sep}{source_name}_{target_name}{sep}'
        output_path += f'{source_name}@{source_db_name}_overlapping_{target_name}@{target_db_name}'
        output_path += f'_{date.today().strftime("%Y-%b-%d")}.txt'
        output_path = output_path.lower()
        with open(file=output_path, mode='w') as w:
            for line in output_record:
                w.write(line + "\n")
            w.write(totals_string)
            w.write('\n\n')