Esempio n. 1
0
def fix_miscolored_phams(engine):
    print("Phixing Phalsely Hued Phams...")
    # Phams which are colored as though they are orphams, when really
    # they are multi-member phams
    query = "SELECT * FROM (SELECT g.PhamID, COUNT(GeneID) AS count, "\
            "p.Color FROM gene AS g INNER JOIN pham AS p ON g.PhamID " \
            "= p.PhamID GROUP BY PhamID) AS c WHERE Color = '#FFFFFF' "\
            "AND count > 1"

    results = mysqldb_basic.query_dict_list(engine, query)


    print(f"Found {len(results)} miscolored phams to fix")

    commands = []
    for dictionary in results:
        pham_id = dictionary["PhamID"]
        count = dictionary["count"]
        color = dictionary["Color"]
        h = s = v = 0
        while h <= 0:
            h = random.random()
        while s < 0.5:
            s = random.random()
        while v < 0.8:
            v = random.random()
        rgb = colorsys.hsv_to_rgb(h, s, v)
        rgb = (rgb[0] * 255, rgb[1] * 255, rgb[2] * 255)
        hexrgb = "#{:02x}{:02x}{:02x}".format(int(rgb[0]), int(rgb[1]),
                                              int(rgb[2]))
        new_color = hexrgb
        commands.append(f"UPDATE pham SET Color = '{new_color}' WHERE PhamID = '{pham_id}'")

    mysqldb.execute_transaction(engine, commands)


    print("Phixing Phalsely Phlagged Orphams...")
    # Phams which are colored as though they are multi-member phams
    # when really they are orphams
    query = "SELECT * FROM (SELECT g.PhamID, COUNT(GeneID) AS count, "\
            "p.Color FROM gene AS g INNER JOIN pham AS p ON g.PhamID "\
            "=p.PhamID GROUP BY PhamID) AS c WHERE Color != '#FFFFFF' "\
            "AND count = 1"

    results = mysqldb_basic.query_dict_list(engine, query)
    print(f"Found {len(results)} miscolored orphams to fix...")

    commands = []
    for dictionary in results:
        pham_id = dictionary["PhamID"]
        count = dictionary["count"]
        color = dictionary["Color"]
        new_color = "#FFFFFF"
        commands.append(f"UPDATE pham SET Color = '{new_color}' WHERE PhamID = '{pham_id}'")

    mysqldb.execute_transaction(engine, commands)
Esempio n. 2
0
def fix_white_phams(engine):
    """
    Find any phams with 2+ members which are colored as though they are
    orphams (#FFFFFF in pham.Color).
    :param engine: sqlalchemy Engine allowing access to the database
    :return:
    """
    query = "SELECT c.PhamID FROM (SELECT g.PhamID, COUNT(GeneID) AS count, "\
            "p.Color FROM gene AS g INNER JOIN pham AS p ON g.PhamID " \
            "= p.PhamID GROUP BY PhamID) AS c WHERE Color = '#FFFFFF' "\
            "AND count > 1"

    results = mysqldb_basic.query_dict_list(engine, query)
    print(f"Found {len(results)} white phams...")

    commands = []
    for dictionary in results:
        pham_id = dictionary["PhamID"]
        h = s = v = 0
        while h <= 0:
            h = random.random()
        while s < 0.5:
            s = random.random()
        while v < 0.8:
            v = random.random()
        rgb = colorsys.hsv_to_rgb(h, s, v)
        rgb = (rgb[0] * 255, rgb[1] * 255, rgb[2] * 255)
        hexrgb = "#{:02x}{:02x}{:02x}".format(int(rgb[0]), int(rgb[1]),
                                              int(rgb[2]))
        new_color = hexrgb.upper()
        commands.append(f"UPDATE pham SET Color = '{new_color}' WHERE "
                        f"PhamID = '{pham_id}'")

    mysqldb.execute_transaction(engine, commands)
Esempio n. 3
0
def fix_colored_orphams(engine):
    """
    Find any single-member phams which are colored as though they are
    multi-member phams (not #FFFFFF in pham.Color).
    :param engine: sqlalchemy Engine allowing access to the database
    :return:
    """
    query = "SELECT * FROM (SELECT g.PhamID, COUNT(GeneID) AS count, " \
            "p.Color FROM gene AS g INNER JOIN pham AS p ON g.PhamID " \
            "=p.PhamID GROUP BY PhamID) AS c WHERE Color != '#FFFFFF' " \
            "AND count = 1"

    results = mysqldb_basic.query_dict_list(engine, query)
    print(f"Found {len(results)} non-white orphams...")

    commands = []
    for dictionary in results:
        pham_id = dictionary["PhamID"]
        count = dictionary["count"]
        color = dictionary["Color"]
        new_color = "#FFFFFF"
        commands.append(f"UPDATE pham SET Color = '{new_color}' WHERE "
                        f"PhamID = '{pham_id}'")

    mysqldb.execute_transaction(engine, commands)
Esempio n. 4
0
def get_pham_genes(engine, phamid):
    """
    Queries the database for the geneids and translations found in the
    indicated pham. Returns a dictionary mapping the pham's geneids to
    their associated translations. All geneid:translation pairs will be
    represented (i.e. no redundant gene filtering is done...).
    :param engine: the Engine allowing access to the database
    :type engine: sqlalchemy Engine
    :param phamid: the pham whose genes are to be returned
    :type phamid: str
    :return: pham_genes
    :rtype: dict
    """
    pham_genes = dict()

    # Query will return the pham's GeneIDs and Translations grouped by genes
    # that share the same sequence
    query = f"SELECT GeneID, Translation FROM gene WHERE PhamID = {phamid} " \
            f"ORDER BY Translation, GeneID ASC"
    query_results = mysqldb_basic.query_dict_list(engine, query)

    for dictionary in query_results:
        geneid = dictionary["GeneID"]
        translation = dictionary["Translation"].decode("utf-8")
        pham_genes[geneid] = translation

    return pham_genes
Esempio n. 5
0
    def test_build_select_3(self):
        """Verify build_select() appends ORDER BY clauses to executable.
        """
        select_query = querying.build_select(self.graph,
                                             self.PhageID,
                                             order_by=self.PhageID)

        dict_list = query_dict_list(self.engine, select_query)

        phage_ids = []
        dict_list = query_dict_list(self.engine, select_query)
        for dict in dict_list:
            phage_ids.append(dict["PhageID"])

        self.assertEqual("Alice", phage_ids[0])
        self.assertTrue("Myrna" in phage_ids)
        self.assertTrue("D29" in phage_ids)
        self.assertTrue("Trixie" in phage_ids)
Esempio n. 6
0
def main(unparsed_args_list):
    """Run main get_gb_records pipeline."""
    # Parse command line arguments
    args = parse_args(unparsed_args_list)

    # Filters input: phage.Status=draft AND phage.HostGenus=Mycobacterium
    # Args structure: [['phage.Status=draft'], ['phage.HostGenus=Mycobacterium']]
    filters = args.filters
    ncbi_cred_dict = ncbi.get_ncbi_creds(args.ncbi_credentials_file)
    output_folder = basic.set_path(args.output_folder, kind="dir", expect=True)
    working_dir = pathlib.Path(RESULTS_FOLDER)
    working_path = basic.make_new_dir(output_folder, working_dir, attempt=50)
    if working_path is None:
        print(f"Invalid working directory '{working_dir}'")
        sys.exit(1)

    # Verify database connection and schema compatibility.
    print("Connecting to the MySQL database...")
    alchemist = AlchemyHandler(database=args.database)
    alchemist.connect(pipeline=True)
    engine = alchemist.engine
    mysqldb.check_schema_compatibility(engine, "the get_gb_records pipeline")

    # Get SQLAlchemy metadata Table object
    # table_obj.primary_key.columns is a
    # SQLAlchemy ColumnCollection iterable object
    # Set primary key = 'phage.PhageID'
    alchemist.build_metadata()
    table = querying.get_table(alchemist.metadata, TARGET_TABLE)
    for column in table.primary_key.columns:
        primary_key = column

    # Create filter object and then add command line filter strings
    db_filter = Filter(alchemist=alchemist, key=primary_key)
    db_filter.values = []

    # Attempt to add filters and exit if needed.
    add_filters(db_filter, filters)

    # Performs the query
    db_filter.update()

    # db_filter.values now contains list of PhageIDs that pass the filters.
    # Get the accessions associated with these PhageIDs.
    keep_set = set(db_filter.values)

    # Create data sets
    print("Retrieving accessions from the database...")
    query = construct_accession_query(keep_set)
    list_of_dicts = mysqldb_basic.query_dict_list(engine, query)
    id_acc_dict = get_id_acc_dict(list_of_dicts)
    acc_id_dict = get_acc_id_dict(id_acc_dict)
    engine.dispose()
    if len(acc_id_dict.keys()) > 0:
        get_data(working_path, acc_id_dict, ncbi_cred_dict)
    else:
        print("There are no records to retrieve.")
Esempio n. 7
0
    def test_build_count_1(self):
        """Verify build_count() creates valid SQLAlchemy executable.
        """
        count_query = querying.build_count(self.graph, self.PhageID)

        dict_list = query_dict_list(self.engine, count_query)
        count_dict = dict_list[0]

        self.assertTrue(isinstance(count_dict["count_1"], int))
Esempio n. 8
0
    def test_build_distinct_2(self):
        """Verify build_distinct() cannot variable aggregated columns.
        MySQL does not accept DISTINCT queries with aggregated
        and non-aggregated columns.
        """
        distinct_query = querying.build_distinct(
            self.graph, [self.PhageID, func.count(self.Cluster)])

        with self.assertRaises(InternalError):
            dict_list = query_dict_list(self.engine, distinct_query)
Esempio n. 9
0
    def test_build_select_4(self):
        """Verify build_select() handles many-to-one relations as expected.
        build_select() queries should duplicate 'one' when filtering 'many'
        """
        where_clause = (self.Subcluster == "A2")
        select_query = querying.build_select(self.graph,
                                             self.Cluster,
                                             where=where_clause)

        dict_list = query_dict_list(self.engine, select_query)

        self.assertTrue(len(dict_list) > 1)
Esempio n. 10
0
    def test_build_count_2(self):
        """Verify build_count() appends WHERE clauses to executable.
        """
        where_clause = (self.PhageID == "Trixie")
        count_query = querying.build_count(self.graph,
                                           self.PhageID,
                                           where=where_clause)

        dict_list = query_dict_list(self.engine, count_query)
        count_dict = dict_list[0]

        self.assertEqual(count_dict["count_1"], 1)
Esempio n. 11
0
    def test_5_map_geneids_to_translations(self):
        """Verify we get back a dictionary"""
        gs_to_ts = map_geneids_to_translations(self.engine)

        command = "SELECT distinct(GeneID) FROM gene"
        results = mysqldb_basic.query_dict_list(self.engine, command)

        # gs_to_ts should be a dictionary
        with self.subTest():
            self.assertEqual(type(gs_to_ts), type(dict()))
        # gs_to_ts should have the right number of geneids
        with self.subTest():
            self.assertEqual(len(gs_to_ts), len(results))
Esempio n. 12
0
    def test_build_count_3(self):
        """Verify build_count() recognizes multiple inputs as expected.
        """
        where_clause = (self.Cluster == "A")
        count_query = querying.build_count(
            self.graph, [self.PhageID, self.Cluster.distinct()],
            where=where_clause)

        dict_list = query_dict_list(self.engine, count_query)
        count_dict = dict_list[0]

        self.assertTrue(count_dict["count_1"] > 1)
        self.assertEqual(count_dict["count_2"], 1)
Esempio n. 13
0
    def test_6_map_translations_to_geneids(self):
        """Verify we get back a dictionary"""
        ts_to_gs = map_translations_to_geneids(self.engine)

        command = "SELECT distinct(CONVERT(Translation USING utf8)) FROM gene"
        results = mysqldb_basic.query_dict_list(self.engine, command)

        # ts_to_gs should be a dictionary
        with self.subTest():
            self.assertEqual(type(ts_to_gs), type(dict()))
        # ts_to_gs should have the right number of translations
        with self.subTest():
            self.assertEqual(len(ts_to_gs), len(results))
Esempio n. 14
0
    def test_build_select_1(self):
        """Verify build_select() creates valid SQLAlchemy executable.
        """
        select_query = querying.build_select(self.graph, self.PhageID)

        phage_ids = []
        dict_list = query_dict_list(self.engine, select_query)
        for dict in dict_list:
            phage_ids.append(dict["PhageID"])

        self.assertTrue("Myrna" in phage_ids)
        self.assertTrue("D29" in phage_ids)
        self.assertTrue("Trixie" in phage_ids)
Esempio n. 15
0
    def test_build_distinct_1(self):
        """Verify build_distinct() handles many-to-one relations as expected.
        build_distinct() should not duplicate 'one' when handling 'many.'
        """
        where_clause = (self.Subcluster == "A2")
        distinct_query = querying.build_distinct(self.graph,
                                                 self.Cluster,
                                                 where=where_clause)

        dict_list = query_dict_list(self.engine, distinct_query)
        self.assertEqual(len(dict_list), 1)

        distinct_dict = dict_list[0]
        self.assertEqual(distinct_dict["Cluster"], "A")
Esempio n. 16
0
    def test_build_select_2(self):
        """Verify build_select() appends WHERE clauses to executable.
        """
        where_clause = (self.Cluster == "A")
        select_query = querying.build_select(self.graph,
                                             self.PhageID,
                                             where=where_clause)

        phage_ids = []
        dict_list = query_dict_list(self.engine, select_query)
        for dict in dict_list:
            phage_ids.append(dict["PhageID"])

        self.assertTrue("Trixie" in phage_ids)
        self.assertTrue("D29" in phage_ids)
        self.assertFalse("Myrna" in phage_ids)
Esempio n. 17
0
def get_pham_colors(engine):
    """
    Queries the database for the colors of existing phams
    :param engine: the Engine allowing access to the database
    :return: pham_colors
    """
    pham_colors = dict()

    color_query = "SELECT PhamID, Color FROM pham"
    color_results = mysqldb_basic.query_dict_list(engine, color_query)

    for dictionary in color_results:
        pham_id = dictionary["PhamID"]
        color = dictionary["Color"]

        pham_colors[pham_id] = color.upper()

    return pham_colors
Esempio n. 18
0
def get_new_geneids(engine):
    """
    Queries the database for those genes that are not yet phamerated.
    :param engine: the Engine allowing access to the database
    :return: new_geneids
    """
    new_geneids = list()

    gene_query = "SELECT GeneID FROM gene WHERE PhamID IS NULL"
    gene_results = mysqldb_basic.query_dict_list(engine, gene_query)

    # At scale, much cheaper to convert list to set than to build the
    # set one gene at a time
    for dictionary in gene_results:
        geneid = dictionary["GeneID"]
        new_geneids.append(geneid)

    return set(new_geneids)
Esempio n. 19
0
def get_geneids_and_translations(engine):
    """
    Constructs a dictionary mapping all geneids to their translations.
    :param engine: the Engine allowing access to the database
    :return: gs_to_ts
    """
    gs_to_ts = dict()

    query = ("SELECT GeneID, CONVERT(Translation USING utf8) as Translation "
             "FROM gene")
    results = mysqldb_basic.query_dict_list(engine, query)

    for dictionary in results:
        geneid = dictionary["GeneID"]
        translation = dictionary["Translation"]
        gs_to_ts[geneid] = translation

    return gs_to_ts
Esempio n. 20
0
def get_new_geneids(engine):
    """
    Queries the database for those genes that are not yet phamerated.
    :param engine: the Engine allowing access to the database
    :return: new_geneids
    """
    new_geneids = set()

    gene_query = "SELECT GeneID FROM gene WHERE PhamID IS NULL"
    gene_results = mysqldb_basic.query_dict_list(engine, gene_query)

    for dictionary in gene_results:
        geneid = dictionary["GeneID"]
        new_geneids = new_geneids | {geneid}

    print(f"Found {len(new_geneids)} genes not in phams...")

    return new_geneids
Esempio n. 21
0
def get_pham_geneids(engine):
    """
    Queries the database for those genes that are already phamerated.
    :param engine: the Engine allowing access to the database
    :return: pham_geneids
    """
    pham_geneids = dict()

    geneid_query = "SELECT GeneID, PhamID FROM gene WHERE PhamID IS NOT NULL"
    geneid_results = mysqldb_basic.query_dict_list(engine, geneid_query)

    for dictionary in geneid_results:
        pham_id = dictionary["PhamID"]
        geneid = dictionary["GeneID"]

        if pham_id in pham_geneids.keys():
            pham_geneids[pham_id] = pham_geneids[pham_id] | {geneid}
        else:
            pham_geneids[pham_id] = {geneid}

    return pham_geneids
Esempio n. 22
0
def get_translation_groups(engine):
    """
    Constructs a dictionary mapping all unique translations to their
    groups of geneids that share them
    :param engine: the Engine allowing access to the database
    :return: ts_to_gs
    """
    ts_to_gs = dict()

    query = ("SELECT GeneID, CONVERT(Translation USING utf8) as Translation "
             "FROM gene")
    results = mysqldb_basic.query_dict_list(engine, query)

    for dictionary in results:
        geneid = dictionary["GeneID"]
        trans = dictionary["Translation"]
        geneids = ts_to_gs.get(trans, [])
        geneids.append(geneid)
        ts_to_gs[trans] = geneids

    return ts_to_gs
Esempio n. 23
0
def main(argument_list):
    """
    :param argument_list:
    :return:
    """
    # Setup argument parser
    cdd_parser = setup_argparser()

    # Use argument parser to parse argument_list
    args = cdd_parser.parse_args(argument_list)

    # Store arguments in more easily accessible variables
    database = args.database
    cdd_dir = expand_path(args.cdd)
    cdd_name = learn_cdd_name(cdd_dir)
    threads = args.threads
    evalue = args.evalue
    rpsblast = args.rpsblast
    tmp_dir = args.tmp_dir
    output_folder = args.output_folder
    reset = args.reset
    batch_size = args.batch_size

    # Set up directory.
    output_folder = basic.set_path(output_folder, kind="dir", expect=True)
    results_folder = pathlib.Path(RESULTS_FOLDER)
    results_path = basic.make_new_dir(output_folder, results_folder,
                                      attempt=50)
    if results_path is None:
        print("Unable to create output_folder.")
        sys.exit(1)

    log_file = pathlib.Path(results_path, MAIN_LOG_FILE)

    # Set up root logger.
    logging.basicConfig(filename=log_file, filemode="w", level=logging.DEBUG,
                        format="pdm_utils find_domains: %(levelname)s: %(message)s")
    logger.info(f"pdm_utils version: {VERSION}")
    logger.info(f"CDD run date: {constants.CURRENT_DATE}")
    logger.info(f"Command line arguments: {' '.join(argument_list)}")
    logger.info(f"Results directory: {results_path}")

    # Early exit if either 1) cdd_name == "" or 2) no rpsblast given and we are
    # unable to find one
    if cdd_name == "":
        msg = (f"Unable to learn CDD database name. Make sure the files in "
              f"{cdd_dir} all have the same basename.")
        logger.error(msg)
        print(msg)
        return

    # Get the rpsblast command and path.
    if rpsblast == "":
        command = get_rpsblast_command()
        rpsblast = get_rpsblast_path(command)

    # Verify database connection and schema compatibility.
    alchemist = AlchemyHandler(database=database)
    alchemist.connect(pipeline=True)
    engine = alchemist.engine
    logger.info(f"Connected to database: {database}.")
    mysqldb.check_schema_compatibility(engine, "the find_domains pipeline")
    logger.info(f"Schema version is compatible.")
    logger.info("Command line arguments verified.")

    if reset:
        logger.info("Clearing all domain data currently in the database.")
        clear_domain_data(engine)

    # Get gene data that needs to be processed
    # in dict format where key = column name, value = stored value.
    cdd_genes = mysqldb_basic.query_dict_list(engine, GET_GENES_FOR_CDD)
    msg = f"{len(cdd_genes)} genes to search for conserved domains..."
    logger.info(msg)
    print(msg)

    # Only run the pipeline if there are genes returned that need it
    if len(cdd_genes) > 0:

        log_gene_ids(cdd_genes)
        make_tempdir(tmp_dir)

        # Identify unique translations to process mapped to GeneIDs.
        cds_trans_dict = create_cds_translation_dict(cdd_genes)

        unique_trans = list(cds_trans_dict.keys())
        msg = (f"{len(unique_trans)} unique translations "
               "to search for conserved domains...")
        logger.info(msg)
        print(msg)

        # Process translations in batches. Otherwise, searching could take
        # so long that MySQL connection closes resulting in 1 or more
        # transaction errors.
        batch_indices = basic.create_indices(unique_trans, batch_size)
        total_rolled_back = 0
        for indices in batch_indices:
            start = indices[0]
            stop = indices[1]
            msg = f"Processing translations {start + 1} to {stop}..."
            logger.info(msg)
            print(msg)
            sublist = unique_trans[start:stop]
            batch_rolled_back = search_translations(
                                    rpsblast, cdd_name, tmp_dir, evalue,
                                    threads, engine, sublist, cds_trans_dict)
            total_rolled_back += batch_rolled_back

        search_summary(total_rolled_back)
        engine.dispose()

    return