def mark_overlaps():
    """
    Find the source sequences overlapping target sequences of the same database.

    Every source sequence returned by ``source_sequences`` is compared with the
    target sequences returned by ``target_sequences`` for the same strain name.
    Targets having also the same length are "strain+length" matches, the
    remaining ones are "only strain" matches; when at least one strain+length
    match exists, the only-strain matches are ignored. The selected matches
    are passed to ``insert_overlaps_in_db`` and described in ``output_record``;
    the module-level counters track how many 1-1 and 1-N matches were found.

    The session is committed only if ``user_asked_to_commit`` is truthy. On
    ``KeyboardInterrupt`` or any other ``Exception`` the session is rolled back
    and a warning line is appended to the report. In every case the session is
    closed, the totals are logged and the report is written to
    ``./overlaps/<source_name>_<target_name>/`` (lower-cased path).
    """
    import os  # local import: only needed to create the report directory

    global total_only_strain_1_to_n, total_strain_plus_length_1_to_n, output_record, \
        total_only_strain_1_to_1, total_strain_plus_length_1_to_1

    session = get_session(db_name)
    cleanup_overlap_tables(session)
    try:
        # The same filters are used for counting (progress bar total) and for iterating.
        source_query_args = dict(
            session=session,
            database_source=source_database_source,
            virus_taxon_name='Severe acute respiratory syndrome coronavirus 2',
            for_overlaps_with_target_source=target_name)
        count_source_seq = source_sequences(**source_query_args, count_only=True)
        for source_seq in tqdm(total=count_source_seq,
                               iterable=source_sequences(**source_query_args)):
            only_strain = []
            strain_plus_length = []
            target_seq_query = target_sequences(
                session=session,
                matching_strain=source_seq.strain_name,
                database_source=target_database_source)
            for target_seq in target_seq_query:
                if target_seq.length == source_seq.length:
                    strain_plus_length.append(target_seq)
                else:
                    only_strain.append(target_seq)

            if strain_plus_length:
                if len(strain_plus_length) > 1:
                    # marker line that makes 1-N matches easy to find in the report
                    output_record.append('\t\tWARN\t\t')
                    total_strain_plus_length_1_to_n += 1
                else:
                    total_strain_plus_length_1_to_1 += 1
                acc_ids = [s.accession_id for s in strain_plus_length]
                output_record.append(
                    f'{source_seq.accession_id} matches with {target_name} strain+length on {acc_ids}'
                )
                # source and target live in the same database, hence the same session twice
                insert_overlaps_in_db(session, session, source_seq,
                                      strain_plus_length, source_name,
                                      target_name)
            elif only_strain:
                if len(only_strain) > 1:
                    output_record.append('\t\tWARN\t\t')
                    total_only_strain_1_to_n += 1
                else:
                    total_only_strain_1_to_1 += 1
                acc_ids = [s.accession_id for s in only_strain]
                output_record.append(
                    f'{source_seq.accession_id} matches with {target_name} strain on {acc_ids}'
                )
                insert_overlaps_in_db(session, session, source_seq,
                                      only_strain, source_name, target_name)
        if user_asked_to_commit:
            session.commit()
    except KeyboardInterrupt:
        rollback(session)
        output_record.append(
            "COMPUTATION INTERRUPTED. TOTALS MAY BE INCOMPLETE !!")
    except Exception:
        logger.exception("")
        rollback(session)
        output_record.append(
            "COMPUTATION INTERRUPTED. TOTALS MAY BE INCOMPLETE !!")
    finally:
        session.close()
        totals_string = (
            'TOTALS:\n'
            f'1-1 MATCHES: strain+length: {total_strain_plus_length_1_to_1} -- only strain {total_only_strain_1_to_1}\n'
            f'1-N MATCHES: strain+length: {total_strain_plus_length_1_to_n} -- only strain {total_only_strain_1_to_n} (search "WARN" to find \'em)\n'
        )
        logger.info(totals_string)
        output_path = f'.{sep}overlaps{sep}{source_name}_{target_name}{sep}'
        output_path += f'{source_name}@{db_name}_overlapping_{target_name}@{db_name}'
        output_path += f'_{date.today().strftime("%Y-%b-%d")}.txt'
        output_path = output_path.lower()
        # Create the report directory if missing, otherwise open() fails and
        # the whole report is lost.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # Explicit encoding: the report must not depend on the platform locale.
        with open(file=output_path, mode='w', encoding='utf-8') as w:
            for line in output_record:
                w.write(line + "\n")
            w.write(totals_string)
def mark_overlaps():
    """
    These overlaps involve the attributes:
    strain, length from COG-UK (table Sequence)
    strain, length from GISAID (table Sequence)

    Every source sequence returned by ``source_sequences`` is compared with the
    target sequences returned by ``target_sequences`` for the same strain name.
    Targets having also the same length are "strain+length" matches, the
    remaining ones are "only strain" matches; when at least one strain+length
    match exists, the only-strain matches are ignored. All the selected
    matches are passed to ``insert_overlaps_in_db``, but only the first one is
    passed to ``put_gisaid_metadata_into_coguk``. After the loop,
    ``set_gisaid_only_based_on_overlap_table`` is called on the target session.

    Both sessions are committed only if ``user_asked_to_commit`` is truthy. On
    ``KeyboardInterrupt`` or any other ``Exception`` both sessions are rolled
    back and a warning line is appended to the report. In every case the
    sessions are closed, the totals are logged and the report is written to
    ``./overlaps/<source_name>_<target_name>/`` (lower-cased path).
    """
    import os  # local import: only needed to create the report directory

    global total_only_strain_1_to_n, total_strain_plus_length_1_to_n, output_record, total_only_strain_final, \
        total_strain_plus_length_final, total_only_strain_1_to_1, total_strain_plus_length_1_to_1

    source_session = get_session(source_db_name)
    target_session = get_session(target_db_name)
    cleanup_overlap_tables(source_session, target_session)
    try:
        # The same filters are used for counting (progress bar total) and for iterating.
        source_query_args = dict(
            session=source_session,
            database_source=source_database_source,
            for_overlaps_with_target_source=target_name)
        count_source_seq = source_sequences(**source_query_args, count_only=True)
        for source_seq in tqdm(total=count_source_seq,
                               iterable=source_sequences(**source_query_args)):
            # for coguk_id in tqdm(read_coguk_overlapping_ids_from_file()):
            #     source_seq = source_session.query(Sequence).filter(Sequence.accession_id == coguk_id).one_or_none()
            #     if source_seq is None:
            #         continue
            only_strain = []
            strain_plus_length = []
            target_seq_query = target_sequences(
                target_session, matching_strain=source_seq.strain_name)
            for target_seq in target_seq_query:
                if target_seq.length == source_seq.length:
                    strain_plus_length.append(target_seq)
                else:
                    only_strain.append(target_seq)

            if strain_plus_length:
                ids = [i.accession_id for i in strain_plus_length]
                composed_string = f'{source_seq.accession_id} matches with {target_name} strain+length on {ids}.'
                if len(strain_plus_length) > 1:
                    composed_string += f' Only {ids[0]} is used for copying metadata'
                    total_strain_plus_length_1_to_n += 1
                else:
                    total_strain_plus_length_1_to_1 += 1
                output_record.append(composed_string)
                total_strain_plus_length_final += 1
                # in 1-N matches only the first target is passed on for the metadata copy
                put_gisaid_metadata_into_coguk(
                    source_session, source_seq.accession_id, target_session,
                    strain_plus_length[0].accession_id)
                # NOTE(review): the source is hard-coded as 'COG-UK' here while the
                # report uses source_name - confirm they are meant to coincide.
                insert_overlaps_in_db(source_session, target_session,
                                      source_seq, strain_plus_length,
                                      'COG-UK', target_name)
            elif only_strain:
                ids = [i.accession_id for i in only_strain]
                composed_string = f'{source_seq.accession_id} matches with {target_name} strain only on {ids}.'
                if len(only_strain) > 1:
                    composed_string += f' Only {ids[0]} is used for copying metadata'
                    total_only_strain_1_to_n += 1
                else:
                    total_only_strain_1_to_1 += 1
                output_record.append(composed_string)
                total_only_strain_final += 1
                put_gisaid_metadata_into_coguk(source_session,
                                               source_seq.accession_id,
                                               target_session,
                                               only_strain[0].accession_id)
                insert_overlaps_in_db(source_session, target_session,
                                      source_seq, only_strain, 'COG-UK',
                                      target_name)
        if user_asked_to_commit:
            source_session.commit()
            target_session.commit()
        # set gisaid_only
        set_gisaid_only_based_on_overlap_table(target_session)
        if user_asked_to_commit:
            target_session.commit()
    except KeyboardInterrupt:
        rollback(source_session)
        rollback(target_session)
        output_record.append(
            "COMPUTATION INTERRUPTED. TOTALS MAY BE INCOMPLETE !!")
    except Exception:
        logger.exception("")
        logger.info("rollback of changes")
        rollback(source_session)
        rollback(target_session)
        output_record.append(
            "COMPUTATION INTERRUPTED. TOTALS MAY BE INCOMPLETE !!")
    finally:
        source_session.close()
        target_session.close()
        # A space was missing after "metadata:", which glued the label to the
        # following "strain+length" figure in the report.
        totals_string = (
            'TOTALS:\n'
            f'1-1 PAIRS strain+length: {total_strain_plus_length_1_to_1} -- only strain {total_only_strain_1_to_1}\n'
            f'1-N PAIRS strain+length: {total_strain_plus_length_1_to_n} -- only strain {total_only_strain_1_to_n}\n'
            f'PAIRS actually used for updating {source_name} metadata: '
            f'strain+length: {total_strain_plus_length_final} -- only strain {total_only_strain_final}\n'
        )
        logger.info(totals_string)
        output_path = f'.{sep}overlaps{sep}{source_name}_{target_name}{sep}'
        output_path += f'{source_name}@{source_db_name}_overlapping_{target_name}@{target_db_name}'
        output_path += f'_{date.today().strftime("%Y-%b-%d")}.txt'
        output_path = output_path.lower()
        # Create the report directory if missing, otherwise open() fails and
        # the whole report is lost.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # Explicit encoding: the report must not depend on the platform locale.
        with open(file=output_path, mode='w', encoding='utf-8') as w:
            for line in output_record:
                w.write(line + "\n")
            w.write(totals_string)
def _narrow_target_candidates(source_seq, source_host, candidates, match_label):
    """
    Reduce the target candidates matching a source sequence and describe the result.

    :param source_seq: source sequence; ``strain_name`` and ``accession_id`` are read.
    :param source_host: host sample of the source sequence; ``country`` is read.
    :param candidates: non-empty list of target sequences sharing the strain name.
    :param match_label: 'strain+length' or 'strain_only', used in the report line.
    :return: tuple (remaining candidates, report line). The report line is the
        empty string when the candidates are discarded as a fake match.
    """
    if len(candidates) > 1 and source_host.country is not None:
        # keep only the targets whose strain name mentions the source country
        country = source_host.country.lower()
        candidates = [
            c for c in candidates if country in c.strain_name.strip().lower()
        ]

    if len(candidates) > 1 and source_seq.strain_name.isnumeric():
        from_switzerland = [
            c for c in candidates if "Switzerland" in c.strain_name
        ]
        from_england = [c for c in candidates if "England" in c.strain_name]
        if len(from_switzerland) == 1:
            candidates = from_switzerland
            description = f'{match_label}+only_numerical_strain+Switzerland_origin'
        elif len(from_england) == len(candidates):
            # if all targets are England sequences, it is a fake match
            return [], ''
        else:
            candidates = filter_exact_strain_number_match(
                source_seq.strain_name, candidates)
            description = match_label
    else:
        candidates = filter_most_exact_strain_match(source_seq.strain_name,
                                                    candidates)
        description = f'{match_label} (possibly also country)'

    ids = [c.accession_id for c in candidates]
    return candidates, f'{source_seq.accession_id} matches with {target_name} {description} on {ids}.'


def mark_overlaps():
    """
    These overlaps involve the attributes:
    strain, length, country from GenBank (tables Sequence and HostSample)
    strain, length from GISAID (table Sequence)

    Every (sequence, host) pair returned by ``source_sequences`` is compared
    with the target sequences returned by ``target_sequences`` for the same
    strain name. Targets having also the same length are "strain+length"
    candidates, the remaining ones are "only strain" candidates; when at least
    one strain+length candidate exists, the only-strain candidates are
    ignored, even if the strain+length ones are all discarded afterwards.
    The chosen candidates are narrowed by country and strain name, and the
    surviving matches are recorded in ``output_record`` and passed to
    ``insert_overlaps_in_db``. After the loop,
    ``set_gisaid_only_based_on_overlap_table`` is called on the target session.

    Both sessions are committed only if ``user_asked_to_commit`` is truthy. On
    ``KeyboardInterrupt`` or any other ``Exception`` both sessions are rolled
    back and a warning line is appended to the report. In every case the
    sessions are closed, the totals are logged and the report is written to
    ``./overlaps/<source_name>_<target_name>/`` (lower-cased path).
    """
    import os  # local import: only needed to create the report directory

    global total_only_strain_1_to_n, total_strain_plus_length_1_to_n, output_record, \
        total_only_strain_1_to_1, total_strain_plus_length_1_to_1, gisaid_only_false_tuples

    source_session = get_session(source_db_name)
    target_session = get_session(target_db_name)
    cleanup_overlap_tables(source_session, target_session)
    all_target_ids_changed = set()
    # dest_to_source_matches = {}
    try:
        # The same filters are used for counting (progress bar total) and for iterating.
        source_query_args = dict(
            session=source_session,
            database_source=source_database_source,
            virus_taxon_name='Severe acute respiratory syndrome coronavirus 2')
        count_source_seq = source_sequences(**source_query_args, count_only=True)
        for source_seq, source_host in tqdm(
                total=count_source_seq,
                iterable=source_sequences(**source_query_args)):
            only_strain = []
            strain_plus_length = []
            target_seq_query = target_sequences(
                target_session, matching_strain=source_seq.strain_name)
            for target_seq in target_seq_query:
                if target_seq.length == source_seq.length:
                    strain_plus_length.append(target_seq)
                else:
                    only_strain.append(target_seq)

            if strain_plus_length:
                matches, composed_string = _narrow_target_candidates(
                    source_seq, source_host, strain_plus_length,
                    'strain+length')
                if len(matches) > 1:
                    total_strain_plus_length_1_to_n += 1
                    composed_string += ' WARNING: 1-N MATCH'
                elif len(matches) == 1:
                    total_strain_plus_length_1_to_1 += 1
            elif only_strain:
                matches, composed_string = _narrow_target_candidates(
                    source_seq, source_host, only_strain, 'strain_only')
                if len(matches) > 1:
                    total_only_strain_1_to_n += 1
                    composed_string += ' WARNING: 1-N MATCH'
                elif len(matches) == 1:
                    total_only_strain_1_to_1 += 1
            else:
                continue

            # the narrowing step may have discarded every candidate
            if not matches:
                continue

            output_record.append(composed_string)
            output_record.append(
                f'\tstrain values: GENBANK: {source_seq.strain_name} GISAID: {[m.strain_name for m in matches]}'
            )
            # Duplicates are detected and counted on the same attribute, so the
            # logged difference always agrees with the check that triggered it.
            distinct_sequence_ids = {m.sequence_id for m in matches}
            if len(distinct_sequence_ids) != len(matches):
                difference = len(matches) - len(distinct_sequence_ids)
                logger.error(f'mismatch of {difference}')
            for g in matches:
                gisaid_only_false_tuples += 1
                all_target_ids_changed.add(g.accession_id)
                # UPDATE DEST TO SOURCE RELATION
                # if not dest_to_source_matches.get(g.accession_id):
                #     dest_to_source_matches[g.accession_id] = {
                #         'GISAID_strain': g.strain_name,
                #         'GenBank-ids': [source_seq.accession_id],
                #         'GenBank-strains': [source_seq.strain_name]
                #     }
                # else:
                #     dest_elem = dest_to_source_matches[g.accession_id]
                #     dest_elem['GenBank-ids'].append(source_seq.accession_id)
                #     dest_elem['GenBank-strains'].append(source_seq.strain_name)
            insert_overlaps_in_db(source_session, target_session, source_seq,
                                  matches, source_name, target_name)
        logger.debug(
            f'total number of target sequences changed {len(all_target_ids_changed)}'
        )
        if user_asked_to_commit:
            target_session.commit()
            source_session.commit()
        # set gisaid_only
        set_gisaid_only_based_on_overlap_table(target_session)
        if user_asked_to_commit:
            target_session.commit()
    except KeyboardInterrupt:
        rollback(source_session)
        rollback(target_session)
        output_record.append(
            "COMPUTATION INTERRUPTED. TOTALS MAY BE INCOMPLETE !!")
    except Exception:
        logger.exception("")
        rollback(source_session)
        rollback(target_session)
        output_record.append(
            "COMPUTATION INTERRUPTED. TOTALS MAY BE INCOMPLETE !!")
    finally:
        source_session.close()
        target_session.close()
        totals_string = (
            'TOTALS:\n'
            f'1-1 MATCHES: strain+length: {total_strain_plus_length_1_to_1} -- only strain {total_only_strain_1_to_1}\n'
            f'1-N MATCHES: strain+length: {total_strain_plus_length_1_to_n} -- only strain {total_only_strain_1_to_n} (search "WARN" to find \'em)\n'
            f'{target_name} tuples in which to set SEQUENCE.gisaid_only to False {gisaid_only_false_tuples}\n'
        )
        logger.info(totals_string)
        output_path = f'.{sep}overlaps{sep}{source_name}_{target_name}{sep}'
        output_path += f'{source_name}@{source_db_name}_overlapping_{target_name}@{target_db_name}'
        output_path += f'_{date.today().strftime("%Y-%b-%d")}.txt'
        output_path = output_path.lower()
        # Create the report directory if missing, otherwise open() fails and
        # the whole report is lost.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # Explicit encoding: the report must not depend on the platform locale.
        with open(file=output_path, mode='w', encoding='utf-8') as w:
            for line in output_record:
                w.write(line + "\n")
            w.write(totals_string)
            w.write('\n\n')