コード例 #1
0
ファイル: test_tickets.py プロジェクト: stjacqrm/pdm_utils
    def setUp(self):
        """Create two import tickets and attach each one to its own bundle."""
        self.ticket1 = ticket.ImportTicket()
        self.ticket2 = ticket.ImportTicket()
        # Give each ticket a distinct phage identifier.
        for tkt, phage_id in ((self.ticket1, "Trixie"), (self.ticket2, "L5")):
            tkt.phage_id = phage_id

        self.bundle1 = bundle.Bundle()
        self.bundle2 = bundle.Bundle()
        # Pair bundle1 with ticket1 and bundle2 with ticket2.
        for bndl, tkt in ((self.bundle1, self.ticket1),
                          (self.bundle2, self.ticket2)):
            bndl.ticket = tkt
コード例 #2
0
ファイル: get_data.py プロジェクト: cdshaffer/pdm_utils
def create_phagesdb_ticket(phage_id):
    """Build a 'replace' ImportTicket for a genome obtained from PhagesDB.

    :param phage_id: Value stored in the ticket's ``phage_id`` attribute.
    :returns: The populated ticket (type 'replace', eval mode 'final').
    """
    # The PhagesDB phage has already been matched to the MySQL database
    # phage, so the annotation author could be copied from the database.
    # Because the GenBank-formatted file comes from PhagesDB, though, the
    # annotation status is expected to be 'final' and the annotation
    # author is expected to be 'hatfull'.
    ticket_data = {
        "host_genus": "retain",  # formerly "retrieve",
        "cluster": "retain",  # formerly "retrieve",
        "subcluster": "retain",  # formerly "retrieve",
        "annotation_status": "final",
        "annotation_author": 1,
        "accession": "retain",  # formerly "retrieve",
        "retrieve_record": 1
    }

    new_ticket = ticket.ImportTicket()
    new_ticket.type = "replace"
    new_ticket.phage_id = phage_id
    new_ticket.description_field = "product"
    new_ticket.eval_mode = "final"
    new_ticket.data_dict = ticket_data
    return new_ticket
コード例 #3
0
ファイル: test_bundle.py プロジェクト: tmavrich/pdm_utils
    def setUp(self):
        """Create an empty bundle, two typed genomes and an empty ticket."""
        self.bndl = bundle.Bundle()

        # Genome typed as coming from a flat file.
        flat_file_gnm = genome.Genome()
        flat_file_gnm.type = "flat_file"
        self.genome1 = flat_file_gnm

        # Genome typed as coming from MySQL.
        mysql_gnm = genome.Genome()
        mysql_gnm.type = "mysql"
        self.genome2 = mysql_gnm

        self.tkt = ticket.ImportTicket()
コード例 #4
0
ファイル: tickets.py プロジェクト: tmavrich/pdm_utils
def parse_import_ticket_data(data_dict):
    """Convert a dictionary of import ticket data to an ImportTicket.

    Every key listed in ``constants.IMPORT_TABLE_STRUCTURE["valid_ticket"]``
    is copied from ``data_dict`` onto the ticket as an attribute of the
    same name. The remaining keys are sorted into four sets according to
    their value: "retrieve", "retain", "parse", or anything else (add).

    :param data_dict:
        A dictionary of ticket data. The fields it is expected to
        describe are:

            0. Import action type
            1. Primary PhageID
            2. Host
            3. Cluster
            4. Subcluster
            5. Status
            6. Annotation Author (int)
            7. Feature field
            8. Accession
            9. Retrieve Record (int)
            10. Eval mode

    :type data_dict: dict
    :returns: A pdm_utils ImportTicket object.
    :rtype: ImportTicket
    """
    ticket_attributes = constants.IMPORT_TABLE_STRUCTURE["valid_ticket"]
    # Keys that are not direct ticket attributes.
    remaining_keys = data_dict.keys() - ticket_attributes

    tkt = ticket.ImportTicket()
    tkt.data_dict = data_dict
    for name in ticket_attributes:
        setattr(tkt, name, data_dict[name])

    data_retrieve = set()
    data_retain = set()
    data_add = set()
    data_parse = set()

    # Route each remaining key to the set that matches its value;
    # any value that is not a recognized keyword is data to add.
    for key in remaining_keys:
        value = data_dict[key]
        if value == "retrieve":
            bucket = data_retrieve
        elif value == "retain":
            bucket = data_retain
        elif value == "parse":
            bucket = data_parse
        else:
            bucket = data_add
        bucket.add(key)

    tkt.data_retrieve = data_retrieve
    tkt.data_retain = data_retain
    tkt.data_parse = data_parse
    tkt.data_add = data_add

    return tkt
コード例 #5
0
ファイル: test_bundle.py プロジェクト: stjacqrm/pdm_utils
    def setUp(self):
        """Build a bundle holding a ticket, two genomes and two genome pairs.

        Three source, CDS, tRNA and tmRNA features are created; only the
        first two of each kind are attached to the 'flat_file' genome.
        Four evaluations (two 'correct', two 'error') are also prepared.
        """

        def _with_id(feature, identifier):
            # Assign the identifier and hand the same object back.
            feature.id = identifier
            return feature

        self.ticket1 = ticket.ImportTicket()

        self.src1 = _with_id(source.Source(), "L5_SRC_1")
        self.src2 = _with_id(source.Source(), "L5_SRC_2")
        self.src3 = _with_id(source.Source(), "L5_SRC_3")

        self.cds1 = _with_id(cds.Cds(), "L5_CDS_1")
        self.cds2 = _with_id(cds.Cds(), "L5_CDS_2")
        self.cds3 = _with_id(cds.Cds(), "L5_CDS_3")

        self.trna1 = _with_id(trna.Trna(), "L5_TRNA_1")
        self.trna2 = _with_id(trna.Trna(), "L5_TRNA_2")
        self.trna3 = _with_id(trna.Trna(), "L5_TRNA_3")

        self.tmrna1 = _with_id(tmrna.Tmrna(), "L5_TMRNA_1")
        self.tmrna2 = _with_id(tmrna.Tmrna(), "L5_TMRNA_2")
        self.tmrna3 = _with_id(tmrna.Tmrna(), "L5_TMRNA_3")

        flat_file_gnm = genome.Genome()
        flat_file_gnm.type = "flat_file"
        flat_file_gnm.cds_features = [self.cds1, self.cds2]
        flat_file_gnm.source_features = [self.src1, self.src2]
        flat_file_gnm.trna_features = [self.trna1, self.trna2]
        flat_file_gnm.tmrna_features = [self.tmrna1, self.tmrna2]
        self.genome1 = flat_file_gnm

        mysql_gnm = genome.Genome()
        mysql_gnm.type = "mysql"
        self.genome2 = mysql_gnm

        self.genome_pair1 = genomepair.GenomePair()
        self.genome_pair2 = genomepair.GenomePair()

        self.bndl = bundle.Bundle()
        self.bndl.ticket = self.ticket1
        # Genomes are keyed in the bundle by their type string.
        for gnm in (self.genome1, self.genome2):
            self.bndl.genome_dict[gnm.type] = gnm
        self.bndl.genome_pair_dict["genome_pair1"] = self.genome_pair1
        self.bndl.genome_pair_dict["genome_pair2"] = self.genome_pair2

        self.eval_correct1 = evaluation.Evaluation(status="correct")
        self.eval_correct2 = evaluation.Evaluation(status="correct")
        self.eval_error1 = evaluation.Evaluation(status="error")
        self.eval_error2 = evaluation.Evaluation(status="error")
コード例 #6
0
ファイル: test_genomepair.py プロジェクト: tmavrich/pdm_utils
    def setUp(self):
        """Create a genome pair from two empty genomes, a ticket and dates."""
        self.genome1 = genome.Genome()
        self.genome2 = genome.Genome()
        self.tkt = ticket.ImportTicket()

        pair = genomepair.GenomePair()
        pair.genome1 = self.genome1
        pair.genome2 = self.genome2
        self.genome_pair = pair

        # All dates share the month/day/year format; the two February
        # dates are equal in value but separate objects.
        date_format = '%m/%d/%Y'
        self.date_jan1 = datetime.strptime('1/1/2000', date_format)
        self.date_feb1 = datetime.strptime('2/1/2000', date_format)
        self.date_feb1_b = datetime.strptime('2/1/2000', date_format)
コード例 #7
0
ファイル: test_tickets.py プロジェクト: stjacqrm/pdm_utils
 def setUp(self):
     """Create a data dictionary and a ticket that references it."""
     self.data_dict = {
         "host_genus": "Mycobacterium smegmatis",
         "accession": "ABC123.1",
         "annotation_status": "final",
         "cluster": "A",
         "subcluster": "A2",
         "annotation_author": 1,
         "retrieve_record": 1,
     }

     draft_ticket = ticket.ImportTicket()
     draft_ticket.phage_id = "Trixie_Draft"
     # The ticket shares the very same dictionary object.
     draft_ticket.data_dict = self.data_dict
     self.tkt1 = draft_ticket
コード例 #8
0
ファイル: test_tickets.py プロジェクト: stjacqrm/pdm_utils
    def test_identify_duplicates_6(self):
        """Verify that two tickets sharing both id and phage_id
        produce one duplicate id and one duplicate phage_id."""

        # Both tickets carry identical id, type and phage_id values.
        list_of_tickets = []
        for _ in range(2):
            tkt = ticket.ImportTicket()
            tkt.id = 1
            tkt.type = "replace"
            tkt.phage_id = "Trixie"
            list_of_tickets.append(tkt)

        id_dupes, phage_id_dupes = tickets.identify_duplicates(
            list_of_tickets, null_set={"none"})

        with self.subTest():
            self.assertEqual(len(id_dupes), 1)
        with self.subTest():
            self.assertEqual(len(phage_id_dupes), 1)
コード例 #9
0
ファイル: test_tickets.py プロジェクト: stjacqrm/pdm_utils
    def test_identify_duplicates_2(self):
        """Verify that repeated 'none' values, which are listed in
        the null set, are not reported as duplicates."""

        # Both tickets use the null value "none" for id and phage_id.
        list_of_tickets = []
        for _ in range(2):
            tkt = ticket.ImportTicket()
            tkt.id = "none"
            tkt.type = "replace"
            tkt.phage_id = "none"
            list_of_tickets.append(tkt)

        id_dupes, phage_id_dupes = tickets.identify_duplicates(
            list_of_tickets, null_set={"none"})

        with self.subTest():
            self.assertEqual(len(id_dupes), 0)
        with self.subTest():
            self.assertEqual(len(phage_id_dupes), 0)
コード例 #10
0
ファイル: test_tickets.py プロジェクト: stjacqrm/pdm_utils
    def test_identify_duplicates_1(self):
        """Verify that tickets with distinct ids and phage_ids
        produce no duplicates."""

        list_of_tickets = []
        for ticket_id, phage_id in ((1, "Trixie"), (2, "L5")):
            tkt = ticket.ImportTicket()
            tkt.id = ticket_id
            tkt.type = "replace"
            tkt.phage_id = phage_id
            list_of_tickets.append(tkt)

        id_dupes, phage_id_dupes = tickets.identify_duplicates(
            list_of_tickets, null_set={"none"})

        with self.subTest():
            self.assertEqual(len(id_dupes), 0)
        with self.subTest():
            self.assertEqual(len(phage_id_dupes), 0)
コード例 #11
0
def save_files_and_tkts(record_list, accession_dict, output_folder):
    """Write GenBank records to flat files and export their import tickets.

    :param record_list: Records to save; each record's ``name`` is used
        as its accession (anything after the first '.' is dropped).
    :param accession_dict: Maps an accession to its genome object.
    :param output_folder: Folder that receives the genomes sub-folder
        and the import table(s).
    """
    genome_folder = pathlib.Path(output_folder, GENOMES_DIR)
    genome_folder.mkdir()

    import_tickets = []
    for record in record_list:
        accession = record.name.split('.')[0]
        gnm = accession_dict[accession]
        flatfile_path = pathlib.Path(
            genome_folder, f"{gnm.name.lower()}__{accession}.gb")
        SeqIO.write(record, str(flatfile_path), "genbank")

        tkt = ticket.ImportTicket()
        tkt.type = "replace"
        tkt.phage_id = gnm.id
        tkt.description_field = "product"
        tkt.eval_mode = "auto"
        ticket_data = (
            ("host_genus", gnm.host_genus),
            ("cluster", gnm.cluster),
            ("subcluster", gnm.subcluster),
            ("annotation_status", gnm.annotation_status),
            ("annotation_author", gnm.annotation_author),
            # Accession is set to 'parse' (rather than gnm.accession) to
            # ensure that during import, the file's accession is directly
            # compared to the database record's accession.
            ("accession", "parse"),
            # TODO secondary_phage_id data is for old ticket format.
            ("secondary_phage_id", gnm.id),
            ("retrieve_record", 1),
        )
        for key, value in ticket_data:
            tkt.data_dict[key] = value
        import_tickets.append(tkt)

    # Nothing to export when no ticket was created.
    if not import_tickets:
        return

    # Now make the import table.
    legacy_path = basic.prepare_filepath(output_folder,
                                         "legacy_import_table.csv")
    legacy_rows = convert_tickets_to_dict(import_tickets, old_format=True)
    basic.export_data_dict(legacy_rows, legacy_path, IMPORT_COLUMNS1)

    # TODO new dictwriter. Use this block instead of above once the
    # new import script is functioning.
    if BOTH:
        new_path = basic.prepare_filepath(output_folder, "import_table.csv")
        new_rows = convert_tickets_to_dict(import_tickets)
        basic.export_data_dict(new_rows,
                               new_path,
                               IMPORT_COLUMNS2,
                               include_headers=True)
コード例 #12
0
ファイル: get_data.py プロジェクト: cdshaffer/pdm_utils
def create_draft_ticket(name):
    """Build an 'add' ImportTicket for a draft genome.

    :param name: Value stored in the ticket's ``phage_id`` attribute.
    :returns: The populated ticket (type 'add', eval mode 'draft').
    """
    ticket_data = {
        "host_genus": "retrieve",
        "cluster": "retrieve",
        "subcluster": "retrieve",
        "annotation_status": "draft",
        "annotation_author": 1,
        "accession": "none",
        "retrieve_record": 1
    }

    new_ticket = ticket.ImportTicket()
    new_ticket.type = "add"
    new_ticket.phage_id = name
    new_ticket.description_field = "product"
    new_ticket.eval_mode = "draft"
    new_ticket.data_dict = ticket_data
    return new_ticket
コード例 #13
0
ファイル: get_data.py プロジェクト: cdshaffer/pdm_utils
def create_genbank_ticket(gnm):
    """Build a 'replace' ImportTicket for a GenBank record.

    :param gnm: Genome object whose ``id`` becomes the ticket's phage_id.
    :returns: The populated ticket (type 'replace', eval mode 'auto').
    """
    ticket_data = {
        "host_genus": "retain",  # formerly gnm.host_genus,
        "cluster": "retain",  # formerly gnm.cluster,
        "subcluster": "retain",  # formerly gnm.subcluster,
        "annotation_status": "retain",  # formerly gnm.annotation_status,
        "annotation_author": "retain",  # formerly gnm.annotation_author,
        # Accession is set to 'parse' to ensure that during import,
        # the file's accession is directly compared to the database
        # record's accession.
        "accession": "parse",
        "retrieve_record": "retain",  # formerly 1
    }

    new_ticket = ticket.ImportTicket()
    new_ticket.type = "replace"
    new_ticket.phage_id = gnm.id
    new_ticket.description_field = "product"
    new_ticket.eval_mode = "auto"
    new_ticket.data_dict = ticket_data
    return new_ticket
コード例 #14
0
ファイル: test_bundle.py プロジェクト: tmavrich/pdm_utils
    def setUp(self):
        """Build a bundle holding a ticket, two genomes and two genome pairs.

        Three source and three CDS features are created; only the first
        two of each kind are appended to the 'flat_file' genome. Four
        evaluations (two 'correct', two 'error') are also prepared.
        """

        def _with_id(feature, identifier):
            # Assign the identifier and hand the same object back.
            feature.id = identifier
            return feature

        self.ticket1 = ticket.ImportTicket()

        self.src1 = _with_id(source.Source(), "L5_SRC_1")
        self.src2 = _with_id(source.Source(), "L5_SRC_2")
        self.src3 = _with_id(source.Source(), "L5_SRC_3")

        self.cds1 = _with_id(cds.Cds(), "L5_CDS_1")
        self.cds2 = _with_id(cds.Cds(), "L5_CDS_2")
        self.cds3 = _with_id(cds.Cds(), "L5_CDS_3")

        self.genome1 = genome.Genome()
        self.genome1.type = "flat_file"
        for cds_ftr in (self.cds1, self.cds2):
            self.genome1.cds_features.append(cds_ftr)
        for src_ftr in (self.src1, self.src2):
            self.genome1.source_features.append(src_ftr)

        self.genome2 = genome.Genome()
        self.genome2.type = "mysql"

        self.genome_pair1 = genomepair.GenomePair()
        self.genome_pair2 = genomepair.GenomePair()

        self.bndl = bundle.Bundle()
        self.bndl.ticket = self.ticket1
        # Genomes are keyed in the bundle by their type string.
        for gnm in (self.genome1, self.genome2):
            self.bndl.genome_dict[gnm.type] = gnm
        self.bndl.genome_pair_dict["genome_pair1"] = self.genome_pair1
        self.bndl.genome_pair_dict["genome_pair2"] = self.genome_pair2

        self.eval_correct1 = eval.Eval(status="correct")
        self.eval_correct2 = eval.Eval(status="correct")
        self.eval_error1 = eval.Eval(status="error")
        self.eval_error2 = eval.Eval(status="error")
コード例 #15
0
ファイル: test_ticket.py プロジェクト: tmavrich/pdm_utils
    def setUp(self):
        """Provide an empty ImportTicket for testing simple methods."""
        empty_ticket = ticket.ImportTicket()
        self.tkt = empty_ticket
コード例 #16
0
def get_final_data(output_folder, matched_genomes):
    """Run sub-pipeline to retrieve 'final' genomes from PhagesDB.

    For each matched pair, the PhagesDB flat file is downloaded when the
    PhagesDB genome has a filename and a date later than the MySQL
    genome's date. Each saved file gets a 'replace' import ticket; the
    tickets are exported as import table(s). Failed downloads are
    reported, and folders left empty are removed.

    :param output_folder: Folder in which the "phagesdb" folder is made.
    :param matched_genomes: Iterable of pairs whose ``genome1`` is the
        MySQL genome and ``genome2`` is the PhagesDB genome.
    """
    phagesdb_folder = pathlib.Path(output_folder, "phagesdb")
    phagesdb_folder.mkdir()
    genome_folder = pathlib.Path(phagesdb_folder, GENOMES_DIR)
    genome_folder.mkdir()

    import_tickets = []
    failed_list = []

    # Iterate through each phage in the MySQL database
    for gnm_pair in matched_genomes:
        mysqldb_gnm = gnm_pair.genome1
        phagesdb_gnm = gnm_pair.genome2

        # Not all phages have associated Genbank-formatted files
        # available on PhagesDB. The flatfile is downloaded only if there
        # is a date tag, and only if that date is more recent than the
        # date stored in the MySQL database for that genome. The tagged
        # date only reflects when the file was uploaded into PhagesDB.
        # The date the actual Genbank record was created is stored within
        # the file, and this too could be less recent than the current
        # version in the MySQL database; however, this part gets checked
        # during the import stage.
        set_phagesdb_gnm_date(phagesdb_gnm)
        set_phagesdb_gnm_file(phagesdb_gnm)
        has_newer_file = (phagesdb_gnm.filename != ""
                          and phagesdb_gnm.date > mysqldb_gnm.date)
        if not has_newer_file:
            continue

        flatfile_data = phagesdb.retrieve_url_data(phagesdb_gnm.filename)
        if flatfile_data == "":
            failed_list.append(mysqldb_gnm.id)
            continue

        # Save the file on the hard drive with the same name as
        # stored on PhagesDB
        flatfile_filename = phagesdb_gnm.filename.split("/")[-1]
        flatfile_path = pathlib.Path(genome_folder, flatfile_filename)
        with flatfile_path.open("w") as flatfile_handle:
            flatfile_handle.write(flatfile_data)

        # Create the new import ticket.
        # Since the PhagesDB phage has been matched to the MySQL database
        # phage, the AnnotationAuthor field could be assigned from the
        # current mysqldb author variable. However, since this
        # genbank-formatted file is acquired through PhagesDB, the
        # Annotation status is expected to be 'final' and the Annotation
        # author is expected to be 'hatfull'.
        tkt = ticket.ImportTicket()
        tkt.type = "replace"
        tkt.phage_id = mysqldb_gnm.id
        tkt.description_field = "product"
        tkt.eval_mode = "final"
        ticket_data = (
            ("host_genus", "retrieve"),
            ("cluster", "retrieve"),
            ("subcluster", "retrieve"),
            ("annotation_status", "final"),
            ("annotation_author", 1),
            ("accession", "retrieve"),
            # TODO secondary_phage_id data is for old ticket format.
            ("secondary_phage_id", mysqldb_gnm.id),
            ("retrieve_record", 1),
        )
        for key, value in ticket_data:
            tkt.data_dict[key] = value
        import_tickets.append(tkt)

    count1 = len(import_tickets)
    if count1 > 0:
        print(f"\n\n{count1} phage(s) were retrieved from PhagesDB.")
        legacy_path = basic.prepare_filepath(phagesdb_folder,
                                             "legacy_import_table.csv")
        legacy_rows = convert_tickets_to_dict(import_tickets,
                                              old_format=True)
        basic.export_data_dict(legacy_rows, legacy_path, IMPORT_COLUMNS1)

        # TODO new dictwriter. Use this block instead of above once the
        # new import script is functioning.
        if BOTH:
            new_path = basic.prepare_filepath(phagesdb_folder,
                                              "import_table.csv")
            new_rows = convert_tickets_to_dict(import_tickets)
            basic.export_data_dict(new_rows,
                                   new_path,
                                   IMPORT_COLUMNS2,
                                   include_headers=True)

    if failed_list:
        print(f"{len(failed_list)} phage(s) failed to be retrieved:")
        for failed_id in failed_list:
            print(failed_id)
        input("\n\nPress ENTER to continue.")

    # Now remove empty folders, innermost first.
    for folder in (genome_folder, phagesdb_folder):
        if len(basic.identify_contents(folder, kind=None)) == 0:
            folder.rmdir()
コード例 #17
0
def retrieve_drafts(output_folder, phage_list):
    """Retrieve auto-annotated 'draft' genomes from PECAAN.

    Each successfully retrieved response is saved as ``<phage>.txt`` in
    the genomes folder and gets an 'add' import ticket; the tickets are
    exported as import table(s). Failures are printed at the end.

    :param output_folder: Folder that receives the genomes sub-folder
        and the import table(s).
    :param phage_list: Phage names appended to the PECAAN URL prefix.
    """
    print(f"\n\nRetrieving {len(phage_list)} new phages from PECAAN")
    genome_folder = pathlib.Path(output_folder, GENOMES_DIR)
    genome_folder.mkdir()

    # Keep track of how many genomes were retrieved from PECAAN
    retrieved_tally = 0
    failed_list = []
    import_tickets = []

    for new_phage in phage_list:
        pecaan_link = constants.PECAAN_PREFIX + new_phage
        response = phagesdb.retrieve_url_data(pecaan_link)
        if response == "":
            print(f"Error: unable to retrieve {new_phage} draft genome.")
            print(pecaan_link)
            failed_list.append(new_phage)
            continue

        pecaan_filepath = pathlib.Path(genome_folder, f"{new_phage}.txt")
        with pecaan_filepath.open("w") as pecaan_handle:
            pecaan_handle.write(response)

        tkt = ticket.ImportTicket()
        tkt.type = "add"
        tkt.phage_id = new_phage
        tkt.description_field = "product"
        tkt.eval_mode = "draft"
        ticket_data = (
            ("host_genus", "retrieve"),
            ("cluster", "retrieve"),
            ("subcluster", "retrieve"),
            ("annotation_status", "draft"),
            ("annotation_author", 1),
            ("accession", "none"),
            # TODO secondary_phage_id data is for old ticket format.
            ("secondary_phage_id", "none"),
            ("retrieve_record", 1),
        )
        for key, value in ticket_data:
            tkt.data_dict[key] = value
        import_tickets.append(tkt)

        print(f"{new_phage} retrieved from PECAAN.")
        retrieved_tally += 1

    # Now make the import table.
    if import_tickets:
        legacy_path = basic.prepare_filepath(output_folder,
                                             "legacy_import_table.csv")
        legacy_rows = convert_tickets_to_dict(import_tickets,
                                              old_format=True)
        basic.export_data_dict(legacy_rows, legacy_path, IMPORT_COLUMNS1)

        # TODO new dictwriter. Use this block instead of above once the
        # new import script is functioning.
        if BOTH:
            new_path = basic.prepare_filepath(output_folder,
                                              "import_table.csv")
            new_rows = convert_tickets_to_dict(import_tickets)
            basic.export_data_dict(new_rows,
                                   new_path,
                                   IMPORT_COLUMNS2,
                                   include_headers=True)

    # Report results
    if retrieved_tally > 0:
        print(f"{retrieved_tally} phage(s) were successfully retrieved")

    if failed_list:
        print(f"{len(failed_list)} phage(s) failed to be retrieved:")
        for failed_phage in failed_list:
            print(failed_phage)
        input("\n\nPress ENTER to continue.")