def __init__(self, user, hostname, db, password, ftp_download_date,
             repository, path_to_log, cpus):
        """Prepare an update run: report file, credentials and a DB cursor.

        :param user: database user name.
        :param hostname: database host.
        :param db: database name.
        :param password: database password.
        :param ftp_download_date: date label embedded in the report file name.
        :param repository: repository name; "refseq" selects database id 2,
            any other value keeps the default id 3 (genbank).
        :param path_to_log: directory in which the report file is created.
        :param cpus: stored as-is on the instance.
        """
        self.repository = repository
        # The database id is either 2 (refseq) or 3 (genbank, the default).
        self.id_database = 2 if repository == "refseq" else 3
        self.domains = ["archaea", "bacteria"]

        # The report is opened for writing and the handle is stored on the
        # instance; it is not closed in this method.
        report_name = "report_{0}_{1}_update_db.log".format(
            repository, ftp_download_date)
        report_path = os.path.join(path_to_log, report_name)
        self.report_database_update = open(report_path, "w")

        # Credentials are kept so that other methods can open connections.
        self.user = user
        self.hostname = hostname
        self.password = password
        self.db = db

        connection = GenomeDatabaseConnectionFTPUpdate.GenomeDatabaseConnectionFTPUpdate(
            hostname, user, password, db)
        self.temp_con = connection
        connection.MakePostgresConnection()
        self.temp_cur = connection.cursor()
        self.cpus = cpus

        self.logger = logging.getLogger('timestamp')
    def worker_addOrVersionNewGenomes(self, queue_in, process_idx, tasklist,
                                      list_report, queue_out):
        """Consume records from ``queue_in`` and collect SQL for new genomes.

        The worker opens its own database connection, then reads records
        until it receives the ``None`` sentinel. A record is processed only
        when it is absent from ``self.dict_existing_records`` and present in
        ``self.genome_dirs_dict``. Every non-sentinel record is forwarded to
        ``queue_out`` whether or not it was processed.

        :param queue_in: queue of records; ``None`` terminates the worker.
        :param process_idx: not used by this method.
        :param tasklist: list-like object; the accumulated SQL list is
            appended to it once, when the worker stops.
        :param list_report: passed through to ``self._addNewGenomes``.
        :param queue_out: queue receiving each record after it was handled.
        """
        list_sql = []
        thread_con = GenomeDatabaseConnectionFTPUpdate.GenomeDatabaseConnectionFTPUpdate(
            self.hostname, self.user, self.password, self.db)
        thread_con.MakePostgresConnection()
        thread_cur = thread_con.cursor()

        while True:
            checkm_record = queue_in.get(block=True, timeout=None)
            # Identity test: ``None`` is the stop sentinel, and ``is`` cannot
            # be fooled by a record type with a custom ``__eq__``.
            if checkm_record is None:
                break
            if (checkm_record not in self.dict_existing_records) and (
                    checkm_record in self.genome_dirs_dict):
                # Drop the trailing ".<suffix>" (presumably the version
                # number -- confirm against the accession format).
                check_record_base = checkm_record.rsplit(".", 1)[0]
                id_record = self._checkPreviousVersion(thread_cur,
                                                       check_record_base)
                if id_record < 0:  # -1
                    # No previous version found: add the genome to the
                    # database as a new entry.
                    list_sql = self._addNewGenomes(checkm_record, list_report,
                                                   list_sql)
                else:
                    # A previous record id exists: hand it over so the
                    # helper can treat this as a new version.
                    list_sql = self._addNewGenomes(checkm_record, list_report,
                                                   list_sql, id_record)
            queue_out.put(checkm_record)
        tasklist.append(list_sql)
Exemple #3
0
    def __init__(self, hostname, user, password, db):
        """Store credentials, build the description table and open a cursor.

        :param hostname: database host.
        :param user: database user name.
        :param password: database password.
        :param db: database name.
        """
        self.hostname = hostname
        self.user = user
        self.password = password
        self.db = db

        self.logger = logging.getLogger('timestamp')

        # Metadata TSV file name -> list of description file names
        # (presumably the files describing its columns -- confirm at the
        # point of use).
        self.description_table = {
            'metadata_gene.tsv': [
                'metadata_gene.desc.tsv',
            ],
            'metadata_nt.tsv': [
                'metadata_nt.desc.tsv',
            ],
            'metadata_ssu_gg.tsv': [
                'metadata_rna.table.desc.tsv',
            ],
            'metadata_ssu_silva.tsv': [
                'metadata_rna.table.desc.tsv',
                'metadata_sequence.desc.tsv',
            ],
            'metadata_lsu_silva_23s.tsv': [
                'metadata_rna.table.desc.tsv',
                'metadata_sequence.desc.tsv',
            ],
            'metadata_lsu_5S.tsv': [
                'metadata_rna.table.desc.tsv',
                'metadata_sequence.desc.tsv',
            ],
            'metadata_ssu_silva_count.tsv': [
                'metadata_ssu_count.desc.tsv',
            ],
            'metadata_lsu_silva_23s_count.tsv': [
                'metadata_ssu_count.desc.tsv',
            ],
            'metadata_lsu_5S_count.tsv': [
                'metadata_ssu_count.desc.tsv',
            ],
            'metadata_trna_count.tsv': [
                'metadata_trna.desc.tsv',
            ],
            'ncbi_assembly_summary.tsv': [
                'metadata_ncbi_assembly_file.desc.tsv',
            ],
            'strain_summary_file.tsv': [
                'metadata_ncbi_assembly.desc.tsv',
                'metadata_ncbi_assembly_file.desc.tsv',
            ],
            'ncbi_assembly_metadata.tsv': [
                'metadata_ncbi_assembly.desc.tsv',
            ],
        }

        connection = GenomeDatabaseConnectionFTPUpdate.GenomeDatabaseConnectionFTPUpdate(
            hostname, user, password, db)
        self.temp_con = connection
        connection.MakePostgresConnection()
        self.temp_cur = connection.cursor()
    def task_sql_command(self, list_sql, process_idx):
        """Run the given SQL statements in batches on a dedicated connection.

        The statements are grouped by ``self.chunks(list_sql, 10)``; each
        group is joined with ';' and sent in a single ``execute`` call. One
        commit is issued after all groups have been executed.

        :param list_sql: SQL statement strings to execute.
        :param process_idx: index used only to label the progress bar.
        """
        connection = GenomeDatabaseConnectionFTPUpdate.GenomeDatabaseConnectionFTPUpdate(
            self.hostname, self.user, self.password, self.db)
        connection.MakePostgresConnection()
        cursor = connection.cursor()

        # Materialised as a list before being handed to the progress bar.
        batches = list(self.chunks(list_sql, 10))
        progress_label = "Process-{}".format(process_idx)

        for batch in atpbar(batches, name=progress_label):
            cursor.execute(';'.join(batch))
        connection.commit()
Exemple #5
0
    def __init__(self, hostname, user, password, db):
        """Store the credentials and open a database connection and cursor.

        :param hostname: database host.
        :param user: database user name.
        :param password: database password.
        :param db: database name.
        """
        self.hostname = hostname
        self.user = user
        self.password = password
        self.db = db

        self.logger = logging.getLogger('timestamp')

        connection = GenomeDatabaseConnectionFTPUpdate.GenomeDatabaseConnectionFTPUpdate(
            hostname, user, password, db)
        self.temp_con = connection
        connection.MakePostgresConnection()
        self.temp_cur = connection.cursor()
Exemple #6
0
    def __init__(self, hostname=None, user=None, password=None, db=None):
        """Store the credentials and, when a database is named, connect to it.

        :param hostname: database host (optional).
        :param user: database user name (optional).
        :param password: database password (optional).
        :param db: database name; when ``None`` no connection is opened and
            ``temp_con`` / ``temp_cur`` are not set on the instance.
        """
        self.hostname = hostname
        self.user = user
        self.password = password
        self.db = db

        self.logger = logging.getLogger('timestamp')

        self.DEFAULT_DOMAIN_THRESHOLD = 10.0

        # Without a database name there is nothing to connect to.
        if db is None:
            return

        connection = GenomeDatabaseConnectionFTPUpdate.GenomeDatabaseConnectionFTPUpdate(
            hostname, user, password, db)
        self.temp_con = connection
        connection.MakePostgresConnection()
        self.temp_cur = connection.cursor()
    def populate_names_dmp_table(self, hostname, user, password, db, taxonomy_dir,
                                 refseq_archaea_assembly_file,
                                 refseq_bacteria_assembly_file,
                                 genbank_archaea_assembly_file,
                                 genbank_bacteria_assembly_file, output_prefix):
        """Read NCBI taxonomy information and create summary output files.

        For every assembly listed in the four assembly files the taxonomy
        tree is walked from the assembly's tax id towards one of the known
        root names, collecting rank-prefixed taxon names on the way. The
        per-assembly mapping ``{prefixed name: tax id}`` is written to
        ``taxids.json`` in the current working directory, and the five most
        common prefixed names are printed.

        :param hostname: database host.
        :param user: database user name.
        :param password: database password.
        :param db: database name.
        :param taxonomy_dir: directory containing ``nodes.dmp`` and
            ``names.dmp``.
        :param refseq_archaea_assembly_file: assembly file handed to the
            parsing helpers.
        :param refseq_bacteria_assembly_file: assembly file handed to the
            parsing helpers.
        :param genbank_archaea_assembly_file: assembly file handed to the
            parsing helpers.
        :param genbank_bacteria_assembly_file: assembly file handed to the
            parsing helpers.
        :param output_prefix: prefix of the generated output file names;
            an empty or ``None`` value falls back to ``"test"``.
        """
        # NOTE(review): the connection and cursor are opened but not used in
        # the visible part of this method -- confirm whether they are still
        # needed before removing them.
        temp_con = GenomeDatabaseConnectionFTPUpdate.GenomeDatabaseConnectionFTPUpdate(
            hostname, user, password, db)
        temp_con.MakePostgresConnection()
        temp_cur = temp_con.cursor()

        # The prefix used to be overwritten unconditionally with "test",
        # silently discarding the caller's argument. Honour the argument and
        # keep "test" only as the fallback.
        if not output_prefix:
            output_prefix = "test"

        # parse organism name
        self._assembly_organism_name(refseq_archaea_assembly_file,
                                     refseq_bacteria_assembly_file,
                                     genbank_archaea_assembly_file,
                                     genbank_bacteria_assembly_file,
                                     output_prefix + '_organism_names.tsv')

        # parse metadata file and taxonomy files
        assembly_to_tax_id = self._assembly_to_tax_id(refseq_archaea_assembly_file,
                                                      refseq_bacteria_assembly_file,
                                                      genbank_archaea_assembly_file,
                                                      genbank_bacteria_assembly_file)

        node_records = self._read_nodes(
            os.path.join(taxonomy_dir, 'nodes.dmp'))
        print('Read %d node records.' % len(node_records))

        name_records = self._read_names(
            os.path.join(taxonomy_dir, 'names.dmp'))
        print('Read %d name records.' % len(name_records))

        # NOTE(review): not used in the visible part of this method.
        taxonomy_file = output_prefix + '_unfiltered_taxonomy.tsv'
        list_ranks_taxonomy = []

        # Names at which the upward traversal stops: 'cellular organisms'
        # for genomes, 'other sequences' for plasmids and 'unclassified
        # sequences' for metagenomic libraries. Loop-invariant, so built once.
        roots = ('cellular organisms', 'other sequences',
                 'unclassified sequences', 'Viruses', 'Viroids')

        print('Number of assemblies: %d' % len(assembly_to_tax_id))
        d = {}
        for assembly_accession, tax_id in assembly_to_tax_id.items():
            d[assembly_accession] = {}
            taxonomy = []
            cur_tax_id = tax_id

            if cur_tax_id not in name_records:
                print('[Warning] Assembly %s has an invalid taxid: %s' % (assembly_accession, tax_id))
                continue

            # traverse taxonomy tree for this assembly
            while name_records[cur_tax_id].name_txt not in roots:
                if cur_tax_id == '1':
                    print('[Error] TaxId %s reached root of taxonomy tree: %s' % (tax_id, taxonomy))
                    sys.exit(-1)

                try:
                    node_record = node_records[cur_tax_id]
                except KeyError:
                    # Without a node record the parent is unknown. The old
                    # bare ``except`` left cur_tax_id unchanged here and so
                    # looped forever; stop this traversal instead.
                    print('[Warning] TaxId %s of assembly %s has no node record; '
                          'lineage truncated: %s' % (cur_tax_id, assembly_accession, taxonomy))
                    break

                if node_record.rank in Taxonomy.rank_labels:
                    rank_index = Taxonomy.rank_labels.index(node_record.rank)
                    rank_prefix = Taxonomy.rank_prefixes[rank_index]
                elif node_record.rank == 'subspecies':
                    rank_prefix = 'sb__'
                elif node_record.rank == 'superkingdom':
                    rank_prefix = 'd__'
                else:
                    # unrecognized rank
                    rank_prefix = 'x__'

                prefixed_name = rank_prefix + name_records[cur_tax_id].name_txt
                taxonomy.append((prefixed_name, cur_tax_id))
                d[assembly_accession][prefixed_name] = cur_tax_id

                parent_tax_id = node_record.parent_tax_id
                if parent_tax_id not in name_records:
                    # The loop condition would raise KeyError on the next
                    # pass; keep the partial lineage and move on.
                    print('[Warning] Parent TaxId %s of %s has no name record; '
                          'lineage of assembly %s truncated: %s'
                          % (parent_tax_id, cur_tax_id, assembly_accession, taxonomy))
                    break
                cur_tax_id = parent_tax_id

            list_ranks_taxonomy.extend(taxonomy)

        # zip(*...) cannot be unpacked into two names when nothing was
        # collected, so handle the empty case explicitly.
        unique_ranks = set(list_ranks_taxonomy)
        if unique_ranks:
            only_names, only_taxid = zip(*unique_ranks)
        else:
            only_names, only_taxid = (), ()
        print(Counter(only_names).most_common(5))

        with open('taxids.json', 'w') as outfile:
            json.dump(d, outfile)