Example no. 1
    def collect_aa_variants_from_db(self, sample_acc_ids: list):
        all_db_annotations = dict()     # acc_id -> [(aa_var_1), (aa_var_2), ...]

        def do(session):
            nonlocal all_db_annotations
            all_aa_variants = session \
                .query(Sequence.accession_id, Annotation.start, Annotation.stop, Annotation.gene_name,
                       Annotation.feature_type, Annotation.product,
                       AminoAcidVariant.start_aa_original, AminoAcidVariant.sequence_aa_original,
                       AminoAcidVariant.sequence_aa_alternative) \
                .select_from(Annotation, AminoAcidVariant, Sequence) \
                .filter(Annotation.annotation_id == AminoAcidVariant.annotation_id,
                        Annotation.sequence_id == Sequence.sequence_id,
                        Sequence.accession_id.in_(sample_acc_ids)) \
                .order_by(Sequence.accession_id, Annotation.start, AminoAcidVariant.start_aa_original) \
                .all()
            for acc_id, ann_start, ann_stop, gene_name, feature_type, product, \
                start_aa_original, sequence_aa_original, sequence_aa_alternative in all_aa_variants:
                # group aa_variants by accession id into a map
                try:
                    seq_annotations = all_db_annotations[acc_id]
                except KeyError:
                    seq_annotations = list()
                    all_db_annotations[acc_id] = seq_annotations
                content = (ann_start, ann_stop, gene_name, feature_type, product,
                           start_aa_original, sequence_aa_original, sequence_aa_alternative)
                seq_annotations.append(content)

        try_py_function(do)
        return all_db_annotations
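
The try/except KeyError grouping used above can also be written with collections.defaultdict. A minimal, self-contained sketch of the same accumulation pattern (the row tuples are illustrative, not real data):

from collections import defaultdict

# rows of (accession_id, payload); values invented for illustration
rows = [("ACC1", ("S", 614, "D", "G")),
        ("ACC1", ("N", 203, "R", "K")),
        ("ACC2", ("S", 501, "N", "Y"))]

grouped = defaultdict(list)
for acc_id, payload in rows:
    grouped[acc_id].append(payload)  # same effect as the try/except lookup

print(dict(grouped))  # {'ACC1': [...], 'ACC2': [...]}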
Example no. 2
def generate_fasta(virus_taxon_id: int,
                   virus_folder_name: str,
                   generated_file_name: str,
                   only_null_lineages: bool = False) -> str:
    """
    Generates a multi fasta file containing all the sequences of the given virus.

    :return: the absolute path to the generated fasta file.
    """
    db_params: dict = import_config.get_database_config_params()
    database.config_db_engine(db_params["db_name"], db_params["db_user"],
                              db_params["db_psw"], db_params["db_port"])

    virus_db_id = virus_database_id(virus_taxon_id)

    if virus_db_id is None:
        raise Exception('Before running this algorithm, create the '
                        f'virus associated with taxon {virus_taxon_id}')

    def get_acc_ids_and_sequences_from_db(
            session: database.Session) -> Generator[Tuple, None, None]:
        query = session.query(database.Sequence.accession_id,
                              database.NucleotideSequence.nucleotide_sequence) \
            .filter(database.Sequence.virus_id == virus_db_id,
                    database.Sequence.sequence_id == database.NucleotideSequence.sequence_id)
        if only_null_lineages:
            query = query.filter(database.Sequence.lineage.is_(None))
        for pair in query.all():
            yield pair[0], pair[1]

    def get_total_acc_ids_from_db(session: database.Session) -> int:
        query = session.query(func.count(database.Sequence.accession_id)) \
            .filter(database.Sequence.virus_id == virus_db_id)
        if only_null_lineages:
            query = query.filter(database.Sequence.lineage.is_(None))
        return query.first()[0]

    target_file_path = get_local_folder_for(
        virus_folder_name, FileType.Fasta) + generated_file_name
    logger.info(f"Generating fasta...")
    with open(file=target_file_path, mode='w') as file:
        total_count = database.try_py_function(get_total_acc_ids_from_db)
        data = database.try_py_function(get_acc_ids_and_sequences_from_db)
        progress = tqdm(total=total_count)
        if total_count > 0:
            first = next(data)
            file.write(f'>{first[0]}\n')
            file.write(first[1])
            progress.update()
        for acc_id_and_sequences in data:
            file.write(f'\n>{acc_id_and_sequences[0]}\n')
            file.write(acc_id_and_sequences[1])
            progress.update()
    target_file_path = abspath(target_file_path)
    logger.info(f"Fasta file generated at {target_file_path}")
    return target_file_path
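
The loop above writes the first record without a leading newline and prefixes every later record with '\n', so the file never ends with a dangling blank line. A minimal sketch of that separator pattern, with an in-memory iterator standing in for the database generator:

records = iter([("ACC1", "ACGT"), ("ACC2", "TTGA")])  # stand-in for the DB generator

with open("demo.fasta", mode="w") as file:
    first = next(records, None)
    if first is not None:
        file.write(f">{first[0]}\n")
        file.write(first[1])
    for acc_id, sequence in records:
        file.write(f"\n>{acc_id}\n")  # separator goes before the record, not after
        file.write(sequence)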
Example no. 3
    def _measure_delta_in_db(self,
                             virus_db_id: int,
                             is_primary_acc_id: bool,
                             sources: Optional[List[str]] = None):
        self._is_primary_acc_id = is_primary_acc_id
        self._virus_db_id = virus_db_id
        self._sources = sources
        if is_primary_acc_id:
            _sequences_in_db_at_start = database.try_py_function(
                vcm.sequence_primary_accession_ids, virus_db_id, sources)
        else:
            _sequences_in_db_at_start = database.try_py_function(
                vcm.sequence_alternative_accession_ids, virus_db_id, sources)
        self._sequences_in_db_at_start = Counter(_sequences_in_db_at_start)
Example no. 4
def virus_database_id(virus_taxon_id) -> Optional[int]:
    class Virus:
        @staticmethod
        def taxon_id():
            return virus_taxon_id

    def get_virus_db_id(session) -> Optional[int]:
        db_virus = vcm.get_virus(session, Virus())
        return db_virus.virus_id if db_virus else None

    return database.try_py_function(get_virus_db_id)
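
try_py_function appears throughout these examples as a session wrapper, but its implementation is not shown. A plausible minimal sketch, assuming SQLAlchemy sessions and the Rollback/RollbackAndRaise control-flow exceptions seen in the other examples (names and behavior are inferred here, not confirmed by the source):

# Hypothetical sketch only: the real try_py_function is not part of these examples.
from sqlalchemy.orm import sessionmaker

Session = sessionmaker()  # assumed to be bound to the configured engine

class Rollback(Exception):
    """Signal: roll back the transaction and swallow the error."""

class RollbackAndRaise(Exception):
    """Signal: roll back the transaction, then re-raise the wrapped error."""
    def __init__(self, cause):
        self.cause = cause

def try_py_function(func, *args, **kwargs):
    # open a session, hand it to func, commit on success, roll back on the signals above
    session = Session()
    try:
        result = func(session, *args, **kwargs)
        session.commit()
        return result
    except Rollback:
        session.rollback()
    except RollbackAndRaise as e:
        session.rollback()
        raise e.cause
    finally:
        session.close()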
Example no. 5
    def get_sequences_in_current_data(self) -> dict:
        def do(session):
            all_sequences = dict()
            for db_items in session.query(Sequence, HostSample, SequencingProject) \
                                      .filter(Sequence.strain_name.isnot(None),
                                              Sequence.host_sample_id == HostSample.host_sample_id,
                                              Sequence.sequencing_project_id == SequencingProject.sequencing_project_id) \
                                      .yield_per(100):
                source_seq, source_host, source_prj = db_items
                all_sequences[source_seq.accession_id] = (source_seq.strain_name, int(source_seq.length),
                                                          source_seq.gc_percentage, source_seq.n_percentage,
                                                          source_host.collection_date, source_host.originating_lab,
                                                          str(source_prj.submission_date) if source_prj.submission_date is not None else None,
                                                          source_prj.sequencing_lab,
                                                          source_host.country, source_host.region,
                                                          source_host.isolation_source)
            return all_sequences

        return try_py_function(do)
Example no. 6
    def get_sequences_in_current_data() -> dict:
        def do(session):
            all_sequences = dict()
            for db_items in session.query(Sequence, HostSample, NucleotideSequence) \
                    .select_from(Sequence, HostSample, SequencingProject, NucleotideSequence) \
                    .filter(Sequence.strain_name.isnot(None),
                            Sequence.host_sample_id == HostSample.host_sample_id,
                            Sequence.sequence_id == NucleotideSequence.sequence_id,
                            Sequence.sequencing_project_id == SequencingProject.sequencing_project_id,
                            SequencingProject.database_source == 'COG-UK') \
                    .yield_per(100):
                source_seq, source_host, nuc_seq_db_obj = db_items
                all_sequences[source_seq.accession_id] = (
                    source_seq.length, source_seq.gc_percentage,
                    source_seq.n_percentage, source_host.collection_date,
                    source_host.country, source_host.region,
                    nuc_seq_db_obj.nucleotide_sequence)
            return all_sequences

        return try_py_function(do)
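
Examples no. 5 and 6 express their joins as filter conditions combined with select_from. The same queries read more clearly with explicit join() calls. A self-contained sketch against two toy models, assuming SQLAlchemy 1.4+ (the real schema is richer than this):

from sqlalchemy import Column, ForeignKey, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class Sequence(Base):
    __tablename__ = "sequence"
    sequence_id = Column(Integer, primary_key=True)
    accession_id = Column(String)
    strain_name = Column(String)

class NucleotideSequence(Base):
    __tablename__ = "nucleotide_sequence"
    sequence_id = Column(Integer, ForeignKey("sequence.sequence_id"), primary_key=True)
    nucleotide_sequence = Column(String)

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

# explicit join instead of select_from(...) plus join conditions inside filter(...)
rows = (session.query(Sequence.accession_id, NucleotideSequence.nucleotide_sequence)
        .join(NucleotideSequence,
              Sequence.sequence_id == NucleotideSequence.sequence_id)
        .filter(Sequence.strain_name.isnot(None))
        .all())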
Example no. 7
def import_epitopes(virus_taxon_id: int):
    # remember to update the data source
    logger.warning(
        "Remember to update the data source with 'python main.py download epitopes' before performing"
        " the current action. This program will resume in 10 seconds.")
    try:
        sleep(10)
    except KeyboardInterrupt:
        return

    # begin
    db_params: dict = import_config.get_database_config_params()
    database.config_db_engine(db_params["db_name"], db_params["db_user"],
                              db_params["db_psw"], db_params["db_port"])
    if virus_taxon_id in [2010960, 186539]:  # == bombali or reston ebolavirus
        logger.info(f'No epitopes available for virus {virus_taxon_id}.')
        return

    virus_db_id = virus_database_id(virus_taxon_id)

    if virus_db_id is None:
        raise Exception(
            'Epitopes must be associated with a Virus DB entity. Before running epitopes, create the '
            f'virus associated with taxon {virus_taxon_id}')
    if epitopes_already_imported(virus_db_id):
        logger.info(
            'Epitopes for this virus are already imported into the DB. Aborting import.'
        )
        return

    # run epitopes' code
    logger.debug(
        f'calling epitopes for virus taxon {virus_taxon_id} associated to DB virus_id {virus_db_id}'
    )
    epitopes, fragments = epitopes_for_virus_taxon(virus_taxon_id)

    # write to file
    # epitopes_file = open(f'.{sep}epitopes.csv', mode='w')
    # epitopes_file.write('virus_db_id\thost_specie_db_id\thost_name\thost_iri\tprotein_ncbi_id\tcell_type\tmhc_class\t'
    #                     'mhc_allele\tresponse_frequency_positive\tassay_type\tseq\tstart\tstop\text_links\t'
    #                     'prediction_process\tis_linear\tepitope_iri\tiedb_epitope_id\n')
    # epitopes_fragm_file = open(f'.{sep}epitopes_fragments.csv', mode='w')
    # epitopes_fragm_file.write('damianos_epitope_id\tseq\tstart\tstop\n')

    def do(session: database.Session):
        global epitope_id_mappings
        try:
            for epitope in epitopes:
                # get contained values
                damianos_epitope_id, virus_taxon_id, host_iri, host_name, host_taxon_id, protein_ncbi_id, cell_type, \
                mhc_class, mhc_allele, response_frequency_positive, assay_type, seq, start, stop, ext_links, \
                prediction_process, is_linear, epitope_iri, iedb_epitope_id = epitope

                # put host specie foreign key
                host_specie_db_id = create_or_get_host_specie_db_id(
                    session, host_taxon_id)

                # insert epitope in the DB
                epitope = (virus_db_id, host_specie_db_id, host_name, host_iri,
                           protein_ncbi_id, cell_type, mhc_class, mhc_allele,
                           response_frequency_positive, assay_type, seq, start,
                           stop, ext_links, prediction_process, is_linear,
                           epitope_iri, iedb_epitope_id)

                # write to file
                # types = (str(type(i)) for i in epitope)
                # items = (str(i) for i in epitope)
                # for i in zip(items, types):
                #     epitopes_file.write(f'{i[0], i[1]}\t')
                # epitopes_file.write('\n')

                epitope_db_id = vcm.create_epitope(session, epitope)
                # bind epitope ids from Damianos with the ones returned from database
                epitope_id_mappings[damianos_epitope_id] = epitope_db_id

            for fragment in fragments:
                _, damianos_epitope_id, seq, start, stop = fragment

                # bind epitope ids from Damianos with the ones returned from database
                try:
                    epitope_db_id = epitope_id_mappings[damianos_epitope_id]
                except KeyError:
                    raise KeyError(
                        f'the epitope fragment ID {damianos_epitope_id} does not appear in the epitope IDs. This epitope fragment'
                        f' will not be inserted into the DB.')

                fragment = (epitope_db_id, seq, start, stop)

                # write to file
                # types = (str(type(i)) for i in fragment)
                # items = (str(i) for i in fragment)
                # for i in zip(items, types):
                #     epitopes_fragm_file.write(f'{i[0], i[1]}\t')
                # epitopes_fragm_file.write('\n')

                vcm.create_epitope_fragment(session, fragment)
            vcm.DBCache.commit_changes()
        except Exception as e:
            logger.exception(
                'Exception occurred while computing and importing epitopes. Epitopes won\'t be inserted into the DB.'
            )
            vcm.DBCache.rollback_changes()
            raise database.RollbackAndRaise(e)
        # finally:
        # epitopes_file.close()
        # epitopes_fragm_file.close()

    database.try_py_function(do)

    # insert one row for each linear epitope into epitope_fragment table
    database.run_script(
        f".{sep}sql_scripts{sep}insert_linear_epitopes_into_epi_fragments.sql")
Example no. 8
def epitopes_already_imported(virus_db_id):
    return database.try_py_function(vcm.check_existence_epitopes, virus_db_id)
Example no. 9
def import_samples_into_vcm():
    global fasta_list, refseq_sc1, refseq_sc2, refseq_sc1_len, refseq_sc2_len, cached_taxonomy, fasta_folder, \
        taxonomy_folder, imported_viruses
    db_params: dict = import_config.get_database_config_params()
    database.config_db_engine(db_params["db_name"], db_params["db_user"],
                              db_params["db_psw"], db_params["db_port"])
    fasta_folder = get_local_folder_for('NMDC', FileType.SequenceOrSampleData)
    taxonomy_folder = get_local_folder_for('NMDC', FileType.TaxonomyData)
    fasta_list = get_fasta_list()
    stats_module.schedule_samples(set_info=stats_module.StatsBasedOnIds(
        [x[:-len('.fasta')] if x.endswith('.fasta') else x for x in fasta_list], True))
    logger.warning(
        f'{len(fasta_list)} files found at {base_url}. Some of them may be skipped because they have no metadata'
        f' or because they are not related to SARS or SARS-CoV-2.')
    logger.info('downloading fasta files')
    download_fastas()

    # REFERENCE SEQUENCE
    refseq_sc2 = reference_sequence(
        "(txid2697049[Organism]) AND srcdb_refseq[Properties]")
    refseq_sc2_len = len(refseq_sc2)
    refseq_sc1 = reference_sequence(
        "txid694009[Organism:exp] NOT txid2697049[Organism] AND srcdb_refseq[Properties]"
    )
    refseq_sc1_len = len(refseq_sc1)

    def virus_taxonomy_pipeline(session: database.Session, taxon: AnyNCBITaxon):
        return vcm.create_or_get_virus(session, taxon)

    # noinspection PyTypeChecker
    def metadata_pipeline(session: database.Session,
                          a_sample: NMDCVirusSample):
        try:
            experiment_id = vcm.create_or_get_experiment(session, a_sample)
            host_specie_id = vcm.create_or_get_host_specie(session, a_sample)
            host_sample_id = vcm.create_or_get_host_sample(
                session, a_sample, host_specie_id)
            sequencing_project_id = vcm.create_or_get_sequencing_project(
                session, a_sample)
            sequence, nucleotide_seq = vcm.create_and_get_sequence(
                session, a_sample, virus_id, experiment_id, host_sample_id,
                sequencing_project_id)
            vcm.DBCache.commit_changes()
            return sequence.sequence_id
        except Exception as e:
            if str(e).startswith(
                    'duplicate key value violates unique constraint "sequence_accession_id_key"'
            ):
                logger.error(
                    f'exception occurred while working on virus sample {a_sample}: {str(e)}'
                )
            else:
                logger.exception(
                    f'exception occurred while working on virus sample {a_sample}'
                )
            vcm.DBCache.rollback_changes()
            raise database.Rollback()

    def nucleotide__annotations__pipeline(session: database.Session,
                                          a_sample: NMDCVirusSample,
                                          db_sequence_id):
        if a_sample.taxon_name() == 'Severe acute respiratory syndrome coronavirus 2':
            refseq = refseq_sc2
        elif a_sample.taxon_name() == 'Bat SARS-related coronavirus':
            refseq = refseq_sc1
        else:
            raise Exception(f'unknown taxon organism {a_sample.taxon_name()}')
        try:
            file_path = get_local_folder_for(
                'NMDC', FileType.Annotations) + str(
                    a_sample.primary_accession_number()) + ".pickle"
            if not os.path.exists(file_path):
                annotations_and_nuc_variants = sequence_aligner(
                    db_sequence_id, refseq, a_sample.nucleotide_sequence(),
                    sc2_chr_name, sc2_ann_file_path, sc2_snpeff_db_name)
                with open(file_path, mode='wb') as cache_file:
                    pickle.dump(annotations_and_nuc_variants,
                                cache_file,
                                protocol=pickle.HIGHEST_PROTOCOL)
            else:
                with open(file_path, mode='rb') as cache_file:
                    annotations_and_nuc_variants = pickle.load(cache_file)
            annotations, nuc_variants = annotations_and_nuc_variants
            for ann in annotations:
                vcm.create_annotation_and_amino_acid_variants(
                    session, db_sequence_id, *ann)
            for nuc in nuc_variants:
                vcm.create_nuc_variants_and_impacts(session, db_sequence_id,
                                                    nuc)
            stats_module.completed_sample(a_sample.primary_accession_number())
        except Exception:
            logger.exception(
                f'exception occurred while working on annotations and nuc_variants of virus sample '
                f'{a_sample.primary_accession_number()}. Rollback transaction.'
            )
            raise database.Rollback()

    # create pipeline_event (will be inserted later)
    pipeline_event = database.PipelineEvent(
        event_date=datetime.now().strftime("%Y-%m-%d"),
        event_name=f'NMDC sars_cov_2 sequences update',
        removed_items=0,
        changed_items=0)

    logger.info('begin import of selected records')
    vcm.DBCache.commit_changes()
    total_sequences_imported = 0
    total_sequences_skipped = 0
    log_of_gisaid_id_path = f"{get_local_folder_for('NMDC', FileType.Logs)}{os.path.sep}gisa_ids.txt"
    with open(log_of_gisaid_id_path, mode='w') as log_of_gisaid_id:
        for file in tqdm(fasta_list):
            try:
                sample = NMDCVirusSample(file)

                # filter samples by organism
                organism_name = sample.taxon_name()
                if organism_name != 'Severe acute respiratory syndrome coronavirus 2':
                    logger.info(
                        f'Sample {file} skipped because related to organism {organism_name}'
                    )
                    total_sequences_skipped += 1
                    continue
                # download taxonomy for new organisms
                organism = cached_taxonomy.get(organism_name)
                if not organism:
                    organism_file = download_ncbi_taxonomy_as_xml_from_name(
                        taxonomy_folder, organism_name)
                    organism = AnyNCBITaxon(organism_file)
                    cached_taxonomy[organism_name] = organism
            except FileNotFoundError:
                logger.error(f'Sample {file} skipped')
                total_sequences_skipped += 1
                continue
            except AssertionError:
                logger.exception(f'Sample {file} skipped')
                total_sequences_skipped += 1
                continue

            # virus id associated to this sample
            virus_id = database.try_py_function(virus_taxonomy_pipeline,
                                                organism)
            if virus_id not in imported_viruses:
                imported_viruses.add(virus_id)
                database.try_py_function(vcm.update_db_metadata, virus_id,
                                         'NMDC')
                vcm.DBCache.commit_changes()
            if virus_id:
                gisa_id = sample.gisa_id()
                if gisa_id:
                    log_of_gisaid_id.write(gisa_id + '\n')

                # import sample
                sequence_id = database.try_py_function(metadata_pipeline,
                                                       sample)
                vcm.DBCache.commit_changes()
                if sequence_id:
                    database.try_py_function(nucleotide__annotations__pipeline,
                                             sample, sequence_id)
                total_sequences_imported += 1

        logger.info(f'{total_sequences_imported} sequences imported.')
        logger.info(f'{total_sequences_skipped} sequences skipped.')
        if total_sequences_skipped > 100:
            send_message(
                f"NMDC importer can have a bug. {total_sequences_skipped} out of "
                f"{total_sequences_skipped+total_sequences_imported} were not imported."
            )

        logger.info(f'list of sequences with GISAID references at path: ' +
                    log_of_gisaid_id_path)

    pipeline_event.added_items = total_sequences_imported
    database.try_py_function(vcm.insert_data_update_pipeline_event,
                             pipeline_event)
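
nucleotide__annotations__pipeline above caches the alignment output as a pickle keyed by accession number: compute on a cache miss, load on a hit. The same compute-or-load pattern in isolation (the path and the computation are illustrative only):

import os
import pickle

def cached_compute(cache_path, compute):
    """Return the cached value if the file exists, otherwise compute and cache it."""
    if not os.path.exists(cache_path):
        value = compute()
        with open(cache_path, mode='wb') as cache_file:
            pickle.dump(value, cache_file, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        with open(cache_path, mode='rb') as cache_file:
            value = pickle.load(cache_file)
    return value

annotations, nuc_variants = cached_compute(
    'demo.pickle', lambda: ([('annotation', 1)], [('nuc_variant', 2)]))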
Example no. 10
    def check_samples_imported(self):
        # overwrite Value object with int for simpler handling
        self._number_completed_samples = self._number_completed_samples.value
        self._number_removed_samples = self._number_removed_samples.value

        message = f"\n" \
                  f"STATS MODULE:\n" \
                  f"Scheduled import of {self._number_scheduled_samples} samples\n" \
                  f"Removed {self._number_removed_samples} samples\n" \
                  f"Completed {self._number_completed_samples} samples\n"
        warn = False

        # find errors
        scheduled_not_completed = self._number_scheduled_samples - self._number_completed_samples
        completed_not_scheduled = self._number_completed_samples - self._number_scheduled_samples
        if scheduled_not_completed > 0:
            warn = True
            message += f'Failed (scheduled but not completed) {scheduled_not_completed} samples\n'
        if completed_not_scheduled > 0:
            warn = True
            message += f'Wrongly processed (completed but not scheduled) {completed_not_scheduled} samples\n'

        # check against DB
        if self._virus_db_id is not None:
            current_sequences_in_db = database.try_py_function(
                vcm.sequence_primary_accession_ids, self._virus_db_id,
                self._sources)
            current_sequences_in_db = Counter(current_sequences_in_db)
            inserted_in_db = current_sequences_in_db - self._sequences_in_db_at_start
            removed_from_db = self._sequences_in_db_at_start - current_sequences_in_db
            message += f"Sequences in DB:\n" \
                       f"\tprevious number {sum(self._sequences_in_db_at_start.values())}.\n" \
                       f"\tcurrent number {sum(current_sequences_in_db.values())}\n" \
                       f"\tdifference: {sum(inserted_in_db.values())} new - {sum(removed_from_db.values())} missing from source and deleted = {sum(inserted_in_db.values()) - sum(removed_from_db.values())}\n"
            # check duplicates
            num_duplicates_at_start = sum(
                self._sequences_in_db_at_start.values()) - len(
                    set(self._sequences_in_db_at_start))
            num_current_duplicates = sum(
                current_sequences_in_db.values()) - len(
                    set(current_sequences_in_db))
            if num_duplicates_at_start > 0 or num_current_duplicates > 0:
                warn = True
                message += f"Duplicated accession_ids in DB:\n" \
                           f"\tprevious number of duplicates: {num_duplicates_at_start}\n" \
                           f"\tcurrent number of duplicates: {num_current_duplicates}\n" \
                           f"\tdetail of current duplicated accession_ids: {sorted(list(current_sequences_in_db - Counter(set(current_sequences_in_db))))}\n"

            num_completed_not_inserted = self._number_completed_samples - sum(
                inserted_in_db.values()) - self._number_removed_samples
            if num_completed_not_inserted > 0:
                warn = True
                message += f'Number of samples completed and not imported (or imported twice) into the DB: {num_completed_not_inserted}\n'
            elif num_completed_not_inserted < 0:
                warn = True
                message += f'Number of samples imported into the DB but incomplete: {-num_completed_not_inserted}\n'

            # performance
            message += self._performance_message(
                self._number_completed_samples)

        if not warn:
            logger.info(message)
        else:
            logger.warning(message)
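
This method and the variant in the next example lean on collections.Counter arithmetic: subtraction keeps only positive counts (which is also why the inserted_not_completed check must test > 0, never < 0), and comparing the sum of counts with the number of distinct keys exposes duplicates. A small self-contained sketch with invented accession ids:

from collections import Counter

at_start = Counter(["A", "B", "B"])           # "B" is duplicated at start
current = Counter(["A", "B", "C", "C"])

inserted_in_db = current - at_start           # Counter({'C': 2}); counts never go negative
removed_from_db = at_start - current          # Counter({'B': 1})

num_current_duplicates = sum(current.values()) - len(set(current))  # 1
duplicated_ids = sorted(current - Counter(set(current)))            # ['C']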
Example no. 11
    def check_samples_imported(self):
        if self._completed_samples_acc_id is None:
            logger.error(
                'STATS MODULE: check_samples_imported called before add_samples. Stats cannot be produced.'
            )
        else:
            warn = False
            message = f"\n" \
                      f"STATS MODULE:\n"
            _scheduled_samples_acc_id = set(self._scheduled_samples_acc_id)
            message += f"Scheduled import of {len(_scheduled_samples_acc_id)} samples\n"
            completed_samples_acc_id = self._queue_to_set(
                self._completed_samples_acc_id)
            removed_samples_acc_id = self._queue_to_set(
                self._removed_samples_acc_id)
            self._completed_samples_acc_id.close()
            self._removed_samples_acc_id.close()
            message += f"Removed {len(removed_samples_acc_id)} samples\n"
            message += f"Completed {len(completed_samples_acc_id)} samples\n"

            # find errors
            scheduled_not_completed = _scheduled_samples_acc_id - completed_samples_acc_id
            completed_not_scheduled = completed_samples_acc_id - _scheduled_samples_acc_id
            if len(scheduled_not_completed) > 0:
                warn = True
                message += f'Failed (scheduled but not completed) {len(scheduled_not_completed)} samples\n' \
                           f'\tsamples id: {scheduled_not_completed}\n'
            if len(completed_not_scheduled) > 0:
                warn = True
                message += f'Wrongly processed (completed but not scheduled) {len(completed_not_scheduled)} samples\n' \
                           f'\tsamples id: {completed_not_scheduled}\n'

            # check against DB
            if self._virus_db_id is not None:
                if self._is_primary_acc_id:
                    current_sequences_in_db = database.try_py_function(
                        vcm.sequence_primary_accession_ids, self._virus_db_id,
                        self._sources)
                else:
                    current_sequences_in_db = database.try_py_function(
                        vcm.sequence_alternative_accession_ids,
                        self._virus_db_id, self._sources)
                current_sequences_in_db = Counter(current_sequences_in_db)
                inserted_in_db = current_sequences_in_db - self._sequences_in_db_at_start
                removed_from_db = self._sequences_in_db_at_start - current_sequences_in_db
                message += f"Sequences in DB:\n" \
                           f"\tprevious number {sum(self._sequences_in_db_at_start.values())}.\n" \
                           f"\tcurrent number {sum(current_sequences_in_db.values())}\n" \
                           f"\tdifference: {sum(inserted_in_db.values())} new - {sum(removed_from_db.values())} missing from source and deleted  = {sum(inserted_in_db.values()) - sum(removed_from_db.values())}\n"
                # check duplicates
                num_duplicates_at_start = sum(
                    self._sequences_in_db_at_start.values()) - len(
                        set(self._sequences_in_db_at_start))
                num_current_duplicates = sum(
                    current_sequences_in_db.values()) - len(
                        set(current_sequences_in_db))
                if num_duplicates_at_start > 0 or num_current_duplicates > 0:
                    warn = True
                    message += f"Duplicated accession_ids in DB:\n" \
                               f"\tprevious number of duplicates: {num_duplicates_at_start}\n" \
                               f"\tcurrent number of duplicates: {num_current_duplicates}\n" \
                               f"\tdetail of current duplicated accession_ids: {sorted(list(current_sequences_in_db - Counter(set(current_sequences_in_db))))}\n"

                # check errors in insertions
                changed_in_db = Counter(
                    removed_samples_acc_id
                )  # this set cannot be retrieved from the database
                completed_not_inserted = Counter(
                    completed_samples_acc_id) - inserted_in_db - changed_in_db
                inserted_not_completed = inserted_in_db - Counter(
                    completed_samples_acc_id) - changed_in_db
                if sum(completed_not_inserted.values()) > 0:
                    warn = True
                    message += f'Number of samples completed and not imported into the DB: {sum(completed_not_inserted.values())}\n' \
                               f'\taccession ids: {sorted(list(completed_not_inserted.elements()))}\n'
                if sum(inserted_not_completed.values()) > 0:
                    warn = True
                    message += f'Number of samples imported into the DB but incomplete: {sum(inserted_not_completed.values())}\n' \
                               f'\taccession ids: {sorted(list(inserted_not_completed.elements()))}\n'

            # performance
            message += self._performance_message(len(completed_samples_acc_id))

            if not warn:
                logger.info(message)
            else:
                logger.warning(message)
Example no. 12
def run(from_sample: Optional[int] = None, to_sample: Optional[int] = None):
    global virus, virus_id, import_method
    db_params: dict = import_config.get_database_config_params()
    database.config_db_engine(db_params["db_name"], db_params["db_user"],
                              db_params["db_psw"], db_params["db_port"])
    virus = COGUKSarsCov2()
    # IMPORT VIRUS TAXON DATA
    virus_id = database.try_py_function(vcm.create_or_get_virus, virus)

    # update last import date
    database.try_py_function(vcm.update_db_metadata, virus_id, 'COG-UK')

    # find outdated and new samples from source (some sequences can be updated, so the sets are not necessarily disjoint)
    logger.warning(
        "Current implementation of deltas for COG-UK uses more than 10 GB of RAM to cache query results and save time.\n"
        "IF YOUR SYSTEM CAN'T PROVIDE MORE THAN 10 GB OF RAM, STOP THE PROCESS NOW.\n"
        "The program will resume in 15 seconds")
    try:
        sleep(15)
    except KeyboardInterrupt:
        return
    id_outdated_sequences, id_new_sequences = virus.deltas()
    logger.warning('Check deltas... The program will resume in 30 seconds.')
    try:
        sleep(30)
    except KeyboardInterrupt:
        return

    # select range
    if from_sample is not None and to_sample is not None:
        id_new_sequences = {
            id_new_sequences.pop()
            for _ in range(min(to_sample - from_sample, len(id_new_sequences)))
        }

    # create pipeline_event (will be inserted later)
    pipeline_event = database.PipelineEvent(
        event_date=datetime.now().strftime("%Y-%m-%d"),
        event_name=f'COGUK sars_cov_2 sequences update',
        removed_items=len(id_outdated_sequences),
        changed_items=0,
        added_items=len(
            id_new_sequences
        ),  # may eventually change if some sequences are not imported
    )

    # initialize statistics module
    stats_module.schedule_samples(
        stats_module.StatsBasedOnIds(id_new_sequences, True, virus_id,
                                     ['COG-UK']))

    # remove outdated sequences
    logger.info(f'removing outdated sequences')
    database.try_py_function(
        vcm.remove_sequence_and_meta_list,
        primary_sequence_accession_id=id_outdated_sequences)
    stats_module.removed_samples(id_outdated_sequences)
    for _id in id_outdated_sequences:
        file_path = get_local_folder_for(
            virus.name, FileType.Annotations) + str(_id).replace(
                '/', '-') + ".pickle"
        remove_file(file_path)

    # prepare multiprocessing
    logger.info(f'importing virus sequences and related tables')
    import_method = Parallel()

    vcm.DBCache.commit_changes()
    for s in virus.get_sequences_of_updated_source(
            filter_accession_ids=id_new_sequences):
        if not s.nucleotide_sequence():
            logger.info(
                f'sample {s.primary_accession_number()} skipped because nucleotide sequence is empty or null'
            )
            continue
        try:
            database.try_py_function(import_method.import_virus_sample, s)
            vcm.DBCache.commit_changes()
        except Exception:
            logger.exception(
                f'exception occurred while working on virus sample {s.primary_accession_number()}'
            )
            vcm.DBCache.rollback_changes()

    logger.info('main process completed')
    import_method.tear_down()

    # remove leftovers of failed samples
    try:
        metadata_samples_to_remove: set = stats_module.get_scheduled_not_completed()
        if len(metadata_samples_to_remove) > 1100:
            send_message(
                f"COGUK importer can have a bug. {len(metadata_samples_to_remove)} out of "
                f"{len(id_new_sequences)} failed.")
        pipeline_event.added_items = pipeline_event.added_items - len(
            metadata_samples_to_remove)
        if len(metadata_samples_to_remove) > 0:
            logger.info(
                f"Removing metadata leftovers of imports that failed during variant/annotation calling or metadata"
                f" ({len(metadata_samples_to_remove)} samples)")

            metadata_samples_to_remove_as_string: list = [
                str(x) for x in metadata_samples_to_remove
            ]
            logger.trace("Accession id of failed imports:\n"
                         f"{metadata_samples_to_remove_as_string}")
            logger.info("Deleting leftovers in database")
            database.try_py_function(vcm.remove_sequence_and_meta_list,
                                     primary_sequence_accession_id=
                                     metadata_samples_to_remove_as_string)
    except Exception:
        logger.exception(
            "Removal of metadata leftovers in the DB of the samples that failed was not successful."
        )

    database.try_py_function(vcm.insert_data_update_pipeline_event,
                             pipeline_event)
Example no. 13
def run(from_sample: Optional[int] = None, to_sample: Optional[int] = None):
    db_params: dict = import_config.get_database_config_params()
    database.config_db_engine(db_params["db_name"], db_params["db_user"],
                              db_params["db_psw"], db_params["db_port"])

    # IMPORT VIRUS TAXON DATA
    virus = GISAIDSarsCov2()
    virus_id = database.try_py_function(vcm.create_or_get_virus, virus)
    database.try_py_function(vcm.update_db_metadata, virus_id, 'GISAID')

    # COMPUTE DELTAS
    acc_ids_sequences_to_remove, acc_id_sequences_to_import, sequences_to_update = virus.deltas()
    acc_ids_sequences_to_update = sequences_to_update.keys()
    logger.warning('Check deltas... The program will resume in 30 seconds.')
    sleep(30)

    # MIND FROM_SAMPLE/TO_SAMPLE
    if from_sample is not None and to_sample is not None:
        count_el_to_import = abs(to_sample - from_sample)
        count_el_to_import = min(count_el_to_import,
                                 len(acc_id_sequences_to_import))
        count_el_to_ignore = len(
            acc_id_sequences_to_import) - count_el_to_import
        try:
            for i in range(count_el_to_ignore):
                acc_id_sequences_to_import.pop()
        except KeyError:
            pass  # if pop on empty set

    # create pipeline_event (will be inserted later)
    pipeline_event = database.PipelineEvent(
        event_date=datetime.now().strftime("%Y-%m-%d"),
        event_name=f'GISAID sars_cov_2 sequences update',
        removed_items=len(acc_ids_sequences_to_remove),
        changed_items=len(
            acc_ids_sequences_to_update
        ),  # provisional - (sqlalchemy wants a value at obj creation)
        added_items=len(
            acc_id_sequences_to_import
        )  # provisional - (sqlalchemy wants a value at obj creation)
    )
    changed_items = 0
    added_items = 0

    stats_module.schedule_samples(
        stats_module.StatsBasedOnIds(acc_id_sequences_to_import, True,
                                     virus_id, ['GISAID']))

    logger.info('Removing outdated sequences...')
    # REMOVE OUTDATED SEQUENCES
    database.try_py_function(vcm.remove_sequence_and_meta_list,
                             acc_ids_sequences_to_remove, None)
    stats_module.removed_samples(acc_ids_sequences_to_remove)

    # IMPORT NEW/CHANGED SEQUENCES
    vcm.DBCache.commit_changes()
    logger.info(f'Importing virus sequences and related tables...')
    import_method = Sequential(virus_id)

    progress = tqdm(total=len(acc_id_sequences_to_import) +
                    len(acc_ids_sequences_to_update))
    for sample in virus.get_sequences_of_updated_source():
        sample_accession_id = sample.primary_accession_number()
        try:
            if sample_accession_id in acc_id_sequences_to_import:
                # import sample from scratch
                database.try_py_function(import_method.import_virus_sample,
                                         sample)
                vcm.DBCache.commit_changes()
                progress.update()
                added_items += 1
            elif sample_accession_id in acc_ids_sequences_to_update:
                # update values inside the database
                changes_in_sequence = sequences_to_update[sample_accession_id]
                database.try_py_function(import_method.update_virus_sample,
                                         sample, changes_in_sequence)
                vcm.DBCache.commit_changes()
                progress.update()
                changed_items += 1
        except KeyboardInterrupt:
            logger.info("main loop interrupted by the user")
            break
        except Exception:
            logger.exception(
                f'exception occurred while working on virus sample {sample.internal_id()}'
            )
            vcm.DBCache.rollback_changes()

    logger.info('main loop completed')
    import_method.tear_down()

    logger.info('Removal of unused database objects...')
    database.try_py_function(vcm.clean_objects_unreachable_from_sequences)

    if len(acc_id_sequences_to_import) - added_items > 100:
        send_message(
            f"GISAID importer can have a bug. import of {len(acc_id_sequences_to_import) - added_items} out of"
            f" {len(acc_id_sequences_to_import)} failed.")

    pipeline_event.changed_items = changed_items
    pipeline_event.added_items = added_items
    database.try_py_function(vcm.insert_data_update_pipeline_event,
                             pipeline_event)