Beispiel #1
0
    def test_parse_genome_data_4(self):
        """Check the Genome objects returned when parse_genome_data() is
        given phage and gene queries but no phage_id_list.

        Three genomes are expected: 'Trixie' with three CDS features,
        'D29' with one, and 'L5' with none."""
        genomes = mysqldb.parse_genome_data(
            self.engine,
            phage_query=self.phage_query,
            gene_query=self.gene_query,
        )

        # Key the genomes by id so the assertions address them by name.
        by_id = {gnm.id: gnm for gnm in genomes}

        # Each entry lazily produces an (actual, expected) pair. It is only
        # evaluated inside its own subTest, so one failing lookup is reported
        # without stopping the remaining checks.
        checks = (
            lambda: (len(genomes), 3),
            lambda: (by_id["Trixie"].seq, "AATT"),
            lambda: (len(by_id["Trixie"].cds_features), 3),
            lambda: (by_id["Trixie"].cds_features[0].genome_length, 4),
            lambda: (by_id["Trixie"].cds_features[1].genome_length, 4),
            lambda: (len(by_id["D29"].cds_features), 1),
            lambda: (by_id["D29"].cds_features[0].genome_length, 5),
            lambda: (len(by_id["L5"].cds_features), 0),
        )
        for check in checks:
            with self.subTest():
                self.assertEqual(*check())
Beispiel #2
0
 def test_parse_genome_data_2(self):
     """Check that asking for the PhageID 'EagleEye' (expected to be
     absent from the test data) yields an empty genome list."""
     genomes = mysqldb.parse_genome_data(
         self.engine,
         phage_id_list=["EagleEye"],
         phage_query=self.phage_query,
     )
     self.assertEqual(len(genomes), 0)
Beispiel #3
0
 def test_parse_genome_data_3(self):
     """Check the single Genome built for PhageID 'Trixie' when phage,
     gene, tRNA and tmRNA queries are all supplied.

     The genome is expected to carry three CDS, two tRNA and one tmRNA
     features, each reporting a genome length of 4."""
     genomes = mysqldb.parse_genome_data(
         self.engine,
         phage_id_list=["Trixie"],
         phage_query=PHAGE_QUERY,
         gene_query=GENE_QUERY,
         trna_query=TRNA_QUERY,
         tmrna_query=TMRNA_QUERY,
     )

     # Each entry lazily produces an (actual, expected) pair. It is only
     # evaluated inside its own subTest, so one failing lookup is reported
     # without stopping the remaining checks.
     checks = (
         lambda: (len(genomes), 1),
         lambda: (genomes[0].id, "Trixie"),
         lambda: (genomes[0].seq, "AATT"),
         lambda: (genomes[0].type, ""),
         lambda: (genomes[0].date, constants.EMPTY_DATE),
         lambda: (len(genomes[0].cds_features), 3),
         lambda: (genomes[0].cds_features[0].genome_length, 4),
         lambda: (len(genomes[0].trna_features), 2),
         lambda: (genomes[0].trna_features[0].genome_length, 4),
         lambda: (len(genomes[0].tmrna_features), 1),
         lambda: (genomes[0].tmrna_features[0].genome_length, 4),
     )
     for check in checks:
         with self.subTest():
             self.assertEqual(*check())
Beispiel #4
0
def get_genome_seqrecords(alchemist, values=None, verbose=False):
    """Parse genomes from the database and convert each to a SeqRecord.

    :param alchemist: Object whose ``engine`` attribute is handed to
        mysqldb.parse_genome_data().
    :param values: PhageIDs forwarded as ``phage_id_list``. When omitted
        (None), an empty list is forwarded, as with the former ``[]``
        default.
    :param verbose: When True, print one progress line per genome.
    :returns: List holding the result of flat_files.genome_to_seqrecord()
        for every parsed genome, in the order the genomes were returned.
    """
    # Build a fresh list per call instead of sharing one mutable default
    # object across every invocation of this function.
    phage_ids = [] if values is None else values

    genomes = mysqldb.parse_genome_data(alchemist.engine,
                                        phage_id_list=phage_ids,
                                        phage_query=PHAGE_QUERY,
                                        gene_query=GENE_QUERY)

    seqrecords = []
    for gnm in genomes:
        # CDS features are processed before the genome is converted.
        process_cds_features(gnm)
        if verbose:
            print(f"Converting {gnm.name}...")
        seqrecords.append(flat_files.genome_to_seqrecord(gnm))

    return seqrecords
Beispiel #5
0
    def test_parse_genome_data_4(self):
        """Check the Genome objects returned when parse_genome_data() is
        given phage, gene, tRNA and tmRNA queries but no phage_id_list.

        Three genomes ('Trixie', 'D29', 'L5') are expected, each with its
        own mix of CDS, tRNA and tmRNA features."""
        genomes = mysqldb.parse_genome_data(
            self.engine,
            phage_query=PHAGE_QUERY,
            gene_query=GENE_QUERY,
            trna_query=TRNA_QUERY,
            tmrna_query=TMRNA_QUERY,
        )

        # Key the genomes by id so the assertions address them by name.
        by_id = {gnm.id: gnm for gnm in genomes}

        # Each entry lazily produces an (actual, expected) pair. It is only
        # evaluated inside its own subTest, so one failing lookup is reported
        # without stopping the remaining checks.
        checks = (
            lambda: (len(genomes), 3),
            lambda: (by_id["Trixie"].seq, "AATT"),
            lambda: (len(by_id["Trixie"].cds_features), 3),
            lambda: (by_id["Trixie"].cds_features[0].genome_length, 4),
            lambda: (by_id["Trixie"].cds_features[1].genome_length, 4),
            lambda: (len(by_id["Trixie"].trna_features), 2),
            lambda: (len(by_id["Trixie"].tmrna_features), 1),
            lambda: (len(by_id["D29"].cds_features), 1),
            lambda: (by_id["D29"].cds_features[0].genome_length, 5),
            lambda: (len(by_id["D29"].trna_features), 1),
            lambda: (by_id["D29"].trna_features[0].id, "D29_1"),
            lambda: (len(by_id["D29"].tmrna_features), 0),
            lambda: (len(by_id["L5"].cds_features), 0),
            lambda: (len(by_id["L5"].trna_features), 0),
            lambda: (len(by_id["L5"].tmrna_features), 1),
            lambda: (by_id["L5"].tmrna_features[0].id, "L5_1"),
        )
        for check in checks:
            with self.subTest():
                self.assertEqual(*check())
Beispiel #6
0
def build_id_record_map(alchemist, phageids):
    """Map SeqRecord ids to SeqRecords for the requested PhageIDs.

    :param alchemist: Object whose ``engine`` attribute is handed to
        mysqldb.parse_genome_data().
    :param phageids: PhageIDs to look up. A falsy value (None, empty
        collection) short-circuits to an empty dict without any query.
    :returns: Dict keyed by ``record.id`` whose values are the records
        produced by flat_files.genome_to_seqrecord().
    """
    # Nothing requested: skip the database round trip entirely.
    if not phageids:
        return {}

    parsed_genomes = mysqldb.parse_genome_data(
        alchemist.engine,
        phage_id_list=phageids,
        phage_query=PHAGE_QUERY,
        gene_query=GENE_QUERY,
        trna_query=TRNA_QUERY,
        tmrna_query=TMRNA_QUERY,
    )

    # Convert lazily so each genome is converted and stored in turn; a later
    # record with a repeated id replaces the earlier one.
    records = (flat_files.genome_to_seqrecord(gnm) for gnm in parsed_genomes)
    return {record.id: record for record in records}
Beispiel #7
0
def get_single_genome(alchemist, phageid, get_features=False, data_cache=None):
    """Return the first genome parsed from the database for one PhageID.

    :param alchemist: Object whose ``engine`` attribute is handed to
        mysqldb.parse_genome_data().
    :param phageid: PhageID to look up; sent as a one-element list.
    :param get_features: When True, the gene, tRNA and tmRNA queries are
        supplied as well; otherwise those queries are passed as None.
    :param data_cache: Optional mapping; when not None, the genome is
        stored in it under ``phageid``.
    :returns: The first element of the parse result. Indexing fails if
        the parse result is empty.
    """
    # Feature queries are only supplied on request.
    if get_features:
        gene_query, trna_query, tmrna_query = (
            GENE_QUERY, TRNA_QUERY, TMRNA_QUERY)
    else:
        gene_query = trna_query = tmrna_query = None

    matches = mysqldb.parse_genome_data(
        alchemist.engine,
        phage_id_list=[phageid],
        phage_query=PHAGE_QUERY,
        gene_query=gene_query,
        trna_query=trna_query,
        tmrna_query=tmrna_query,
    )
    genome = matches[0]

    if data_cache is not None:
        data_cache[phageid] = genome
    return genome
Beispiel #8
0
 def test_parse_genome_data_1(self):
     """Check the single Genome built for PhageID 'L5' when only a phage
     query is supplied and gnm_type is 'mysql'.

     No gene query is passed, so no CDS features are expected."""
     genomes = mysqldb.parse_genome_data(
         self.engine,
         phage_id_list=["L5"],
         phage_query=self.phage_query,
         gnm_type="mysql",
     )

     # Each entry lazily produces an (actual, expected) pair. It is only
     # evaluated inside its own subTest, so one failing lookup is reported
     # without stopping the remaining checks.
     checks = (
         lambda: (len(genomes), 1),
         lambda: (genomes[0].id, "L5"),
         lambda: (genomes[0].seq, "ATCG"),
         lambda: (genomes[0].type, "mysql"),
         lambda: (genomes[0].date, constants.EMPTY_DATE),
         lambda: (len(genomes[0].cds_features), 0),
     )
     for check in checks:
         with self.subTest():
             self.assertEqual(*check())
Beispiel #9
0
 def test_parse_genome_data_3(self):
     """Check the single Genome built for PhageID 'Trixie' when phage and
     gene queries are supplied.

     Three CDS features are expected, and gnm_type is left at its
     default so the genome type is the empty string."""
     genomes = mysqldb.parse_genome_data(
         self.engine,
         phage_id_list=["Trixie"],
         phage_query=self.phage_query,
         gene_query=self.gene_query,
     )

     # Each entry lazily produces an (actual, expected) pair. It is only
     # evaluated inside its own subTest, so one failing lookup is reported
     # without stopping the remaining checks.
     checks = (
         lambda: (len(genomes), 1),
         lambda: (genomes[0].id, "Trixie"),
         lambda: (genomes[0].seq, "AATT"),
         lambda: (genomes[0].type, ""),
         lambda: (genomes[0].date, constants.EMPTY_DATE),
         lambda: (len(genomes[0].cds_features), 3),
         lambda: (genomes[0].cds_features[0].genome_length, 4),
     )
     for check in checks:
         with self.subTest():
             self.assertEqual(*check())
Beispiel #10
0
def execute_ffx_export(alchemist,
                       output_path,
                       file_format,
                       db_version,
                       table="phage",
                       values=None,
                       verbose=False):
    """Export genomes from the database as SeqRecord-based flat files.

    :param alchemist: Object whose ``engine`` attribute is handed to
        mysqldb.parse_genome_data().
    :param output_path: Destination forwarded to write_seqrecord().
    :param file_format: File format forwarded to write_seqrecord().
    :param db_version: Version data appended to every record through
        append_database_version().
    :param table: Table to export; only ``"phage"`` is supported.
    :param values: PhageIDs forwarded as ``phage_id_list``. When omitted
        (None), an empty list is forwarded, as with the former ``[]``
        default.
    :param verbose: When True, print progress messages.
    :raises ValueError: If ``table`` is anything other than ``"phage"``.
    """
    # Reject unsupported tables up front, before any output or database
    # work. Previously the check sat after the first verbose print and was
    # duplicated in a second, unreachable branch.
    if table != "phage":
        raise ValueError(
            f"Unsupported table '{table}': only 'phage' can be exported.")

    # Build a fresh list per call instead of sharing one mutable default.
    phage_ids = [] if values is None else values

    if verbose:
        # The original message used the undefined names `data_name` and
        # `sql_handle`, so verbose mode raised NameError.
        # NOTE(review): assumes the handler exposes the database name as
        # `.database` -- confirm against AlchemyHandler.
        print(f"Retrieving {table} data from {alchemist.database}...")

    genomes = mysqldb.parse_genome_data(alchemist.engine,
                                        phage_id_list=phage_ids,
                                        phage_query="SELECT * FROM phage",
                                        gene_query="SELECT * FROM gene")

    if verbose:
        print(f"Converting {table} data to SeqRecord format...")

    seqrecords = []
    for gnm in genomes:
        # CDS SeqFeatures are set before the genome is converted.
        set_cds_seqfeatures(gnm)
        if verbose:
            print(f"Converting {gnm.name}...")
        seqrecords.append(flat_files.genome_to_seqrecord(gnm))

    if verbose:
        print("Appending database version...")
    for record in seqrecords:
        append_database_version(record, db_version)

    write_seqrecord(seqrecords, file_format, output_path, verbose=verbose)
Beispiel #11
0
def main(unparsed_args_list):
    """Entry point of the pipeline that retrieves genome data updates.

    Steps, in order: parse the arguments, create the working directory,
    load the config, read the phage table from MySQL, optionally pull
    PhagesDB data and match it against the MySQL genomes, then run each
    retrieval step (updates, final, genbank, draft) that was requested.

    :param unparsed_args_list: Raw arguments handed to parse_args().
    """
    # --- Arguments and working directory --------------------------------
    args = parse_args(unparsed_args_list)
    force = args.force_download
    args.output_folder = basic.set_path(
        args.output_folder, kind="dir", expect=True)
    working_dir = pathlib.Path(RESULTS_FOLDER)
    working_path = basic.make_new_dir(
        args.output_folder, working_dir, attempt=50)

    # make_new_dir signals failure by returning None.
    if working_path is None:
        print(f"Invalid working directory '{working_dir}'")
        sys.exit(1)

    # --- Configuration (file values and/or defaults) --------------------
    config = configfile.build_complete_config(args.config_file)
    mysql_creds, ncbi_creds = config["mysql"], config["ncbi"]

    # --- Database connection and schema check ---------------------------
    print("Preparing genome data sets from the MySQL database...")
    alchemist = AlchemyHandler(
        database=args.database,
        username=mysql_creds["user"],
        password=mysql_creds["password"],
    )
    alchemist.connect(pipeline=True)
    engine = alchemist.engine
    mysqldb.check_schema_compatibility(engine, "the get_data pipeline")

    # --- Current MySQL state, used to decide what needs updating --------
    phage_query = ("SELECT PhageID, Name, HostGenus, Status, Cluster, "
                   "DateLastModified, Accession, RetrieveRecord, Subcluster, "
                   "AnnotationAuthor FROM phage")
    mysqldb_genomes = mysqldb.parse_genome_data(
        engine=engine, phage_query=phage_query, gnm_type="mysqldb")
    # The engine is not used past this point.
    engine.dispose()

    mysqldb_genome_dict = {}
    for genome in mysqldb_genomes:
        if force:
            # With the default date, every retrieved record looks newer.
            genome.date = constants.EMPTY_DATE
        mysqldb_genome_dict[genome.id] = genome

    # --- PhagesDB data (needed by the updates/final/draft steps) --------
    needs_phagesdb = (args.updates or args.final or args.draft) is True
    if needs_phagesdb:
        print("Retrieving data from PhagesDB...")
        sequenced_phages = phagesdb.get_phagesdb_data(constants.API_SEQUENCED)
        sequenced_by_name = basic.convert_list_to_dict(
            sequenced_phages, "phage_name")
        phagesdb_genome_dict = phagesdb.parse_genomes_dict(
            sequenced_by_name, gnm_type="phagesdb", seq=False)

        # Stop if no phage data was retrieved.
        if len(phagesdb_genome_dict) == 0:
            sys.exit(1)

        # Only the first two items of the match result are used.
        match_result = match_genomes(mysqldb_genome_dict, phagesdb_genome_dict)
        matched_genomes, unmatched_phagesdb_ids = (
            match_result[0], match_result[1])

    # --- Requested retrieval steps --------------------------------------
    if args.updates is True:
        get_update_data(working_path, matched_genomes)
    if args.final is True:
        get_final_data(working_path, matched_genomes)
    if args.genbank is True:
        get_genbank_data(working_path, mysqldb_genome_dict, ncbi_creds,
                         args.genbank_results, force=force)
    if args.draft is True:
        if force:
            # Also download the draft genomes that are already in the
            # database, not just the unmatched ones.
            unmatched_phagesdb_ids |= get_matched_drafts(matched_genomes)
        get_draft_data(working_path, unmatched_phagesdb_ids)
Beispiel #12
0
def main(unparsed_args_list):
    """Entry point of the pipeline that retrieves genome data updates.

    Steps, in order: parse the arguments, create a date-stamped working
    directory, load the NCBI credentials, read the phage table from
    MySQL, optionally pull PhagesDB data and match it against the MySQL
    genomes, then run each retrieval step (updates, final, genbank,
    draft) that was requested.

    :param unparsed_args_list: Raw arguments handed to parse_args().
    """
    # --- Arguments and working directory --------------------------------
    args = parse_args(unparsed_args_list)
    date = time.strftime("%Y%m%d")
    args.output_folder = basic.set_path(
        args.output_folder, kind="dir", expect=True)
    working_dir = pathlib.Path(f"{date}_get_data")
    working_path = basic.make_new_dir(
        args.output_folder, working_dir, attempt=10)

    # make_new_dir signals failure by returning None.
    if working_path is None:
        print(f"Invalid working directory '{working_dir}'")
        sys.exit(1)

    ncbi_cred_dict = ncbi.get_ncbi_creds(args.ncbi_credentials_file)

    # --- Database connection and schema check ---------------------------
    print("Preparing genome data sets from the MySQL database...")
    engine = mysqldb.connect_to_db(args.database)
    mysqldb.check_schema_compatibility(engine, "the get_data pipeline")

    # --- Current MySQL state, used to decide what needs updating --------
    phage_query = ("SELECT PhageID, Name, HostGenus, Status, Cluster, "
                   "DateLastModified, Accession, RetrieveRecord, Subcluster, "
                   "AnnotationAuthor FROM phage")
    mysqldb_genomes = mysqldb.parse_genome_data(
        engine=engine, phage_query=phage_query, gnm_type="mysqldb")
    # The engine is not used past this point.
    engine.dispose()

    mysqldb_genome_dict = {genome.id: genome for genome in mysqldb_genomes}

    # --- PhagesDB data (needed by the updates/final/draft steps) --------
    needs_phagesdb = (args.updates or args.final or args.draft) is True
    if needs_phagesdb:
        print("Retrieving data from PhagesDB...")
        sequenced_phages = phagesdb.get_phagesdb_data(constants.API_SEQUENCED)
        sequenced_by_name = basic.convert_list_to_dict(
            sequenced_phages, "phage_name")
        phagesdb_genome_dict = phagesdb.parse_genomes_dict(
            sequenced_by_name, gnm_type="phagesdb", seq=False)

        # Stop if no phage data was retrieved.
        if len(phagesdb_genome_dict) == 0:
            sys.exit(1)

        # Only the first two items of the match result are used.
        match_result = match_genomes(mysqldb_genome_dict, phagesdb_genome_dict)
        matched_genomes, unmatched_phagesdb_ids = (
            match_result[0], match_result[1])

    # --- Requested retrieval steps --------------------------------------
    if args.updates is True:
        get_update_data(working_path, matched_genomes)
    if args.final is True:
        get_final_data(working_path, matched_genomes)
    if args.genbank is True:
        get_genbank_data(working_path, mysqldb_genome_dict, ncbi_cred_dict,
                         args.genbank_results)
    if args.draft is True:
        get_draft_data(working_path, unmatched_phagesdb_ids)
    print("\n\n\nRetrieve updates script completed.")