def read_configs(schema_path, filename):
    """ Reads file with schema config values.

        Parameters
        ----------
        schema_path : str
            Path to the schema's directory.
        filename : str
            Name of the file that contains the config values.

        Returns
        -------
        configs : dict
            Dictionary with config names as keys and config
            values as values.
    """

    config_file = os.path.join(schema_path, filename)
    if os.path.isfile(config_file):
        # Load configs dictionary
        configs = fo.pickle_loader(config_file)
    else:
        sys.exit('Could not find a valid config file.')

    return configs
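# Usage sketch (hypothetical paths; '.schema_config' is the pickled config file
# referenced elsewhere in this module, stored at the root of a schema directory):
#
#     configs = read_configs('/path/to/schema_dir', '.schema_config')
#     bsr = configs.get('bsr')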
Example 2
def save_extracted_cds(genome, identifier, orf_file, protein_table, cds_file):
    """ Extracts coding sequences from a genome assembly based
        on Prodigal's gene predictions. Writes coding sequences
        to a FASTA file and information about coding sequences to
        a TSV file.

        Parameters
        ----------
        genome : str
            Path to the genome's FASTA file.
        identifier : str
            Genome identifier added to the FASTA record headers
            and used as the first field in the TSV file.
        orf_file : str
            Path to the file with Prodigal results.
        protein_table : str
            Path to the TSV file to which information about the
            coding sequences will be written.
        cds_file : str
            Path to the FASTA file to which coding sequences
            will be written.

        Returns
        -------
        total_cds : int
            Total number of coding sequences extracted from
            the genome.
    """

    # import contigs for current genome/assembly
    contigs = fao.import_sequences(genome)
    # load Prodigal's gene predictions (reading frames)
    reading_frames = fo.pickle_loader(orf_file)
    # extract coding sequences from contigs
    genome_info = extract_genome_cds(reading_frames, contigs, 1)
    # create FASTA records and write coding sequences to file
    cds_lines = fao.create_fasta_lines(genome_info[0], identifier)
    fo.write_lines(cds_lines, cds_file)

    write_protein_table(protein_table, identifier, genome_info[1])

    total_cds = len(genome_info[0])

    return total_cds
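# Usage sketch (hypothetical paths): extract the coding sequences predicted by
# Prodigal for one assembly and report how many were written to the FASTA file.
#
#     total = save_extracted_cds('/data/genomes/GCA_000007265.fasta',
#                                'GCA_000007265',
#                                '/data/prodigal/GCA_000007265_ORF',
#                                '/data/output/cds_info.tsv',
#                                '/data/output/GCA_000007265_cds.fasta')
#     print('Extracted {0} coding sequences.'.format(total))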
Example 3
def main(input_files, output_directory, protein_table, blast_score_ratio,
         cpu_cores, taxa, proteome_matches, no_cleanup, blast_path):

    # create output directory
    fo.create_directory(output_directory)

    # create temp directory
    temp_directory = fo.join_paths(output_directory, ['temp'])
    fo.create_directory(temp_directory)

    # validate input files
    genes_list = fo.join_paths(temp_directory, ['listGenes.txt'])
    genes_list = pv.check_input_type(input_files, genes_list)
    loci_paths = fo.read_lines(genes_list)

    schema_directory = os.path.dirname(loci_paths[0])
    schema_basename = fo.file_basename(schema_directory)
    print('Schema: {0}'.format(schema_directory))
    print('Number of loci: {0}'.format(len(loci_paths)))

    # find annotations based on reference proteomes for species
    proteome_results = {}
    if taxa is not None:
        proteome_results = proteome_annotations(schema_directory,
                                                temp_directory,
                                                taxa,
                                                blast_score_ratio,
                                                cpu_cores,
                                                proteome_matches,
                                                blast_path)

    # find annotations in SPARQL endpoint
    print('\nQuerying UniProt\'s SPARQL endpoint...')
    config_file = fo.join_paths(input_files, ['.schema_config'])
    if os.path.isfile(config_file) is True:
        config = fo.pickle_loader(config_file)
        translation_table = config.get('translation_table', [11])[0]
    else:
        translation_table = 11
    sparql_results = sparql_annotations(loci_paths,
                                        translation_table,
                                        cpu_cores)

    loci_info = {}
    if protein_table is not None:
        # read the "cds_info.tsv" table created by the CreateSchema process
        table_lines = fo.read_tabular(protein_table)
        for line in table_lines[1:]:
            # create locus identifier based on the genome identifier
            # and the CDS identifier in the table
            locus_id = line[0].replace('_', '-')
            locus_id = locus_id + '-protein{0}'.format(line[-2])
            loci_info[locus_id] = line

    annotations = join_annotations(sparql_results, proteome_results, loci_info)

    # table header
    header = ['Locus_ID']
    if len(loci_info) > 0:
        header += table_lines[0]

    header += ['Uniprot_Name', 'UniProt_URL']

    if len(proteome_results) > 0:
        header.extend(['Proteome_ID', 'Proteome_Product',
                       'Proteome_Gene_Name', 'Proteome_Species',
                       'Proteome_BSR'])

    loci_info_bool = len(loci_info) > 0
    output_table = create_annotations_table(annotations, output_directory,
                                            header, schema_basename,
                                            loci_info_bool)

    if no_cleanup is False:
        shutil.rmtree(temp_directory)

    print('\n\nThe table with new information can be found at:'
          '\n{0}'.format(output_table))
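# Illustration of the Locus_ID construction above, using a hypothetical row
# from the "cds_info.tsv" table (real rows may have more columns; only the
# first field and the second-to-last field, the CDS number, are used).
example_row = ['GCA_000007265', 'contig_1', '100', '550', '5', '+']
example_locus_id = example_row[0].replace('_', '-') + '-protein{0}'.format(example_row[-2])
# example_locus_id == 'GCA-000007265-protein5'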
Example 4
def main(schema_directory, cpu_cores, nomenclature_server, submit, blast_path,
         update_profiles):

    # get ns configs
    local_date, schema_uri = pv.read_configs(schema_directory, '.ns_config')
    # get schema and species identifiers
    schema_id = schema_uri.split('/')[-1]
    species_id = schema_uri.split('/')[-3]
    if nomenclature_server is None:
        nomenclature_server = schema_uri.split('species/')[0]
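    # Illustrative example (URI layout inferred from the splits above): a
    # schema_uri of the form '<base>/species/9/schemas/32' yields
    # schema_id '32', species_id '9' and nomenclature_server '<base>/'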

    if submit is True and 'tutorial' not in nomenclature_server:
        print('\nOnly authorized registered users may submit new alleles.')
        token = cr.capture_login_credentials(nomenclature_server)
    else:
        token = ''

    # GET request headers
    headers_get = ct.HEADERS_GET_JSON
    headers_get['Authorization'] = token

    # determine current user ID and Role
    if submit is True and 'tutorial' not in nomenclature_server:
        user_id, user_role, user_auth = cr.user_info(nomenclature_server,
                                                     headers_get)
        # verify if user has authorization to submit
        url = cr.make_url(nomenclature_server, 'auth', 'check')
        response = cr.simple_get_request(url, headers_get)[1]
        if response.status_code == 200:
            user_auth = True
        else:
            sys.exit('Current user has no authorization to submit novel '
                     'alleles.\nYou can request authorization to submit '
                     'novel alleles by sending an e-mail to: '
                     '*****@*****.**')
        print('User id: {0}'.format(user_id))
        print('User role: {0}\n'.format(user_role))
    else:
        user_id = ''
        user_role = ''
        user_auth = 'tutorial' in nomenclature_server

    # POST requests headers
    headers_post = ct.HEADERS_POST_JSON
    headers_post['Authorization'] = token
    headers_post['user_id'] = user_id
    # POST headers to send binary data
    headers_post_bytes = ct.HEADERS_POST
    headers_post_bytes['Authorization'] = token
    headers_post_bytes['user_id'] = user_id
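    # ct.HEADERS_GET_JSON, ct.HEADERS_POST_JSON and ct.HEADERS_POST are assumed
    # to be dicts with the base HTTP headers (e.g. Accept/Content-Type); the
    # 'Authorization' token and 'user_id' are added to them before use.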

    schema_params = pv.read_configs(schema_directory, '.schema_config')

    # verify that local configs have a single value per parameter
    if not all(len(schema_params[k]) == 1
               for k in schema_params
               if k != 'chewBBACA_version'):
        sys.exit('Cannot sync schema with multiple values per parameter.')

    # check if schema exists in the NS
    schema_name, ns_params = cr.get_species_schemas(schema_id, species_id,
                                                    nomenclature_server,
                                                    headers_get)[2:]

    # verify that local configs match NS configs
    # add window size
    if not all(str(schema_params[k][0]) == ns_params[k]['value']
               for k in schema_params
               if k not in ['chewBBACA_version', 'window_size']):
        sys.exit('Local configs do not match Chewie-NS configs.')

    # Get the name of the species from the provided id
    # or vice-versa
    species_id, species_name = cr.species_ids(species_id, nomenclature_server,
                                              headers_get)

    print('Schema id: {0}'.format(schema_id))
    print('Schema name: {0}'.format(schema_name))
    print("Schema's species: {0} (id={1})".format(species_name, species_id))
    print('Last synced: {0}'.format(local_date))

    # get last modification date
    # setting syncing date to last modification date will allow
    # all users to sync even when the schema is locked and being
    # updated by another user
    ns_date = ns_params['last_modified']['value']
    print('\nRemote schema was last modified on: {0}'.format(ns_date))

    # exit if remote schema has not been updated since last
    # sync date and current user does not wish to submit new alleles
    if local_date == ns_date and submit is False:
        sys.exit('\nRemote schema has not been updated since last sync '
                 'process. Local schema is up-to-date.')

    # Create a temporary dir for the new alleles
    temp_dir = os.path.join(os.path.dirname(schema_directory), 'temp')
    if not os.path.exists(temp_dir):
        os.mkdir(temp_dir)

    # retrieve alleles added to schema after last sync date
    print('\nRetrieving alleles added to remote schema '
          'after {0}...'.format(local_date))
    loci_alleles, server_time, count = retrieve_latest(local_date, schema_uri,
                                                       headers_get, ns_date)

    print('Retrieved {0} alleles for {1} loci.'
          ''.format(count, len(loci_alleles)))

    # Get schema files from genes list file
    genes_list = os.path.join(schema_directory, '.genes_list')
    genes = fo.pickle_loader(genes_list)

    # update loci structure
    not_in_ns, pickled_loci, \
        updated, not_update, \
        rearranged = update_loci_files(loci_alleles, genes,
                                       schema_directory, temp_dir)

    total_local = sum(len(v[0]) for v in not_in_ns.values())
    print('Local schema has {0} novel alleles for {1} '
          'loci.'.format(total_local, len(not_in_ns)))

    # check if there are any changes to make
    if len(pickled_loci) == 0:
        shutil.rmtree(temp_dir)
        sys.exit('Remote schema has not been altered and local schema '
                 'does not have novel alleles.')

    results = {}
    attributed = 0
    if submit is True and user_auth is True and len(not_in_ns) > 0:

        # attempt to lock schema
        lock_res = cr.simple_post_request(
            nomenclature_server,
            headers_post,
            ['species', species_id, 'schemas', schema_id, 'lock'],
            data=json.dumps({'action': 'lock'}))[1]
        # if schema is already locked user cannot send alleles
        lock_status = lock_res.status_code
        if lock_status == 403:
            print('Schema is already locked. Another user might be updating '
                  'the schema. Please repeat the syncing process after a '
                  'while to add your new alleles to the Chewie-NS.\nThe '
                  'process will now update your local schema with the alleles '
                  'retrieved from the Chewie-NS.')
        else:

            # after locking, check if date matches ns_date
            date_res = cr.simple_get_request(
                nomenclature_server, headers_get,
                ['species', species_id, 'schemas', schema_id, 'modified'])[1]

            date_value = (date_res.json()).split(' ')[-1]

            if date_value != ns_date:
                print('Data retrieved from the Chewie-NS has an older '
                      'timestamp than current schema timestamp. Schema '
                      'might have been updated before this syncing process. '
                      'Please repeat the syncing process in order to add '
                      'your new alleles to the schema. The process will now '
                      'update your local schema with the alleles retrieved '
                      'from the Chewie-NS.')

                # unlock schema
                lock_res = cr.simple_post_request(
                    nomenclature_server,
                    headers_post,
                    ['species', species_id, 'schemas', schema_id, 'lock'],
                    data=json.dumps({'action': 'unlock'}))[1]
            else:
                print(
                    'Collecting data and creating files to submit local alleles...'
                )
                # get list of loci for schema in the NS
                loci_res = cr.simple_get_request(
                    nomenclature_server, headers_get,
                    ['species', species_id, 'schemas', schema_id, 'loci'])[1]
                # get loci files names from response
                for locus in loci_res.json()['Loci']:
                    locus_name = locus['name']['value'] + '.fasta'
                    locus_uri = locus['locus']['value']
                    if locus_name in not_in_ns:
                        not_in_ns[locus_name].append(locus_uri)

                # create files with length values to update
                length_files = create_lengths_files(not_in_ns, temp_dir)

                # create new alleles data
                alleles_files, \
                    loci_ids, \
                    loci_names = create_alleles_files(not_in_ns, nomenclature_server,
                                                      user_id, species_name,
                                                      species_id, schema_id,
                                                      temp_dir)

                # compress files with new alleles
                zipped_files = [
                    '{0}.zip'.format(file) for file in alleles_files
                ]
                list(map(fo.file_zipper, alleles_files, zipped_files))
                alleles_data = list(zip(zipped_files, loci_ids, loci_names))
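                # e.g. an alleles file '/tmp/locus1_alleles' is compressed to
                # '/tmp/locus1_alleles.zip' and paired with the locus identifier
                # and locus name for upload (paths are illustrative)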

                print('Sending and inserting new alleles...')
                failed, \
                    start_count = upload_alleles_data(alleles_data, length_files,
                                                      nomenclature_server, headers_post,
                                                      headers_post_bytes, species_id,
                                                      schema_id)

                # track progress through endpoint
                # set time limit for task completion (seconds)
                print()
                time_limit = 2100
                current_time = 0
                status = 'Updating'
                start_count = int(start_count.json()['nr_alleles'])
                while status != 'Complete' and (current_time < time_limit):
                    insertion_status = cr.simple_get_request(
                        nomenclature_server, headers_get, [
                            'species', species_id, 'schemas', schema_id,
                            'loci', 'locus', 'update'
                        ])[1]
                    insertion_status = insertion_status.json()
                    if 'message' in insertion_status:
                        status = 'Complete'
                        results = insertion_status['identifiers']

                    current_count = int(insertion_status['nr_alleles'])

                    inserted = current_count - start_count
                    print('\r',
                          '    Inserted {0} alleles.'.format(inserted),
                          end='')
                    time.sleep(2)
                    current_time += 2

                if current_time != time_limit:
                    # determine alleles that were attributed an identifier
                    repeated = sum(len(r[0]) for r in results.values())
                    attributed = sum(len(r[1]) for r in results.values())

                    print(
                        '\nThe Chewie-NS inserted {0} new alleles and detected '
                        '{1} repeated alleles.'.format(attributed, repeated))
                else:
                    print(
                        '\nCould not retrieve allele identifiers assigned by '
                        'Chewie-NS. Will adapt schema with retrieved alleles. '
                        'Please repeat the syncing process in order to assign '
                        'the new identifiers for the submitted alleles.')

                # remove files in temp folder
                fo.remove_files(length_files)
                fo.remove_files(alleles_files)
                fo.remove_files(zipped_files)

    # change pickled files to FASTA files
    for locus, pick in pickled_loci.items():
        rearranged = pickle_to_fasta(locus, pick, temp_dir, results,
                                     rearranged)

    # change identifiers in SQLite DB
    if len(rearranged) > 0 and update_profiles is True:
        print('\nUpdating local allele identifiers...')
        altered = ps.update_profiles(schema_directory, rearranged)
        if altered is not None:
            print('Updated {0} profiles.\n'.format(altered))
        else:
            print('Could not find local SQLite database to upload profiles.\n')

    # Re-determine the representative sequences
    if attributed > 0 or count > 0:
        PrepExternalSchema.main(temp_dir, schema_directory, cpu_cores,
                                float(schema_params['bsr'][0]),
                                int(schema_params['minimum_locus_length'][0]),
                                11, '', None, blast_path)

        # delete invalid alleles and genes files
        parent_dir = os.path.dirname(schema_directory)
        files = [
            os.path.join(parent_dir, file) for file in os.listdir(parent_dir)
            if 'invalid' in file
        ]

        fo.remove_files(files)

        # get last modification date
        last_modified = cr.simple_get_request(
            nomenclature_server, headers_get,
            ['species', species_id, 'schemas', schema_id, 'modified'])[1]
        last_modified = (last_modified.json()).split(' ')[-1]
        server_time = last_modified

        # update NS config file with latest server time
        ns_configs = os.path.join(schema_directory, '.ns_config')
        fo.pickle_dumper([server_time, schema_uri], ns_configs)

    print('Received {0} new alleles for {1} loci and sent '
          '{2} new alleles for {3} loci.'.format(count, len(pickled_loci),
                                                 attributed, len(not_in_ns)))

    # delete temp directory
    shutil.rmtree(temp_dir)

    # delete pre-computed BSR values from 'short' directory
    # representatives might have changed and BSR values are outdated
    short_dir = os.path.join(schema_directory, 'short')
    bsr_files = [
        os.path.join(short_dir, f) for f in os.listdir(short_dir)
        if f.endswith('_bsr.txt')
    ]
    fo.remove_files(bsr_files)
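# A minimal, generic sketch of the polling pattern used above: query a status
# endpoint until it reports completion or a time limit is reached. The
# 'fetch_status' callable, the time limit and the interval are illustrative.
import time


def poll_until_complete(fetch_status, time_limit=2100, interval=2):
    """ Calls 'fetch_status' until the returned payload contains a
        'message' key or until 'time_limit' seconds have elapsed.
        Returns the last payload and the elapsed time in seconds.
    """
    elapsed = 0
    payload = {}
    while elapsed < time_limit:
        payload = fetch_status()
        if 'message' in payload:
            break
        time.sleep(interval)
        elapsed += interval

    return payload, elapsed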
Example 5
def pickle_to_fasta(locus, pickled_file, temp_dir, identifiers, reassigned):
    """ Creates FASTA files with the information contained in
        a pickled file.

        Parameters
        ----------
        locus : str
            The identifier of the locus with '.fasta' suffix.
        pickled_file : str
            Path to the pickled file with a dictionary that
            has integer identifiers as keys and a tuple with
            two elements: the identifier that should be assigned
            to the allele (might differ from the key if the allele
            is new, in which case it starts with '*') and the DNA
            sequence of the allele.
        temp_dir : str
            Path to the directory where the output FASTA file will
            be created.
        identifiers : dict
            The `zip_res` variable returned by the
            :py:func:`upload_alleles_data` function. It will be used
            to change allele identifiers that were successfully
            inserted into the Chewie-NS.

        Returns
        -------
        fasta_path : str
            Path to the FASTA file created by this function
    """

    # strip the '.fasta' suffix (str.rstrip removes a character set, not a suffix)
    locus_id = locus[:-len('.fasta')] if locus.endswith('.fasta') else locus
    locus_int = locus_id.split('-')[-1].lstrip('0')
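    # e.g. a locus file named 'some-locus-000012.fasta' (illustrative) yields
    # locus_id 'some-locus-000012' and locus_int '12'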
    if locus_int in identifiers:
        repeated = identifiers[locus_int][0]
        attributed = identifiers[locus_int][1]
    else:
        repeated = {}
        attributed = {}

    inv_reassigned = {}
    if locus in reassigned:
        inv_reassigned = {v: k for k, v in reassigned[locus].items()}

    locus_sequences = fo.pickle_loader(pickled_file)

    natsorted_locus = sorted(locus_sequences)

    fasta_path = os.path.join(temp_dir, locus)
    records = []
    for seqid in natsorted_locus:
        recid = locus_sequences[seqid][0]
        seq = locus_sequences[seqid][1]
        seq_hash = hashlib.sha256(seq.encode('utf-8')).hexdigest()
        # switch to the identifier attributed by the Chewie-NS
        if seq_hash in attributed:
            new_recid = attributed[seq_hash]
            if recid in inv_reassigned:
                old_id = inv_reassigned[recid]
                reassigned[locus][old_id] = new_recid
            else:
                if locus not in reassigned:
                    reassigned[locus] = {recid: new_recid}
                else:
                    reassigned[locus][recid] = new_recid
            recid = new_recid
        elif seq_hash in repeated:
            new_recid = repeated[seq_hash]
            if recid in inv_reassigned:
                old_id = inv_reassigned[recid]
                reassigned[locus][old_id] = new_recid
            else:
                if locus not in reassigned:
                    reassigned[locus] = {recid: new_recid}
                else:
                    reassigned[locus][recid] = new_recid
            recid = new_recid

        record = '>{0}_{1}\n{2}'.format(locus_id, recid, seq)
        records.append(record)

    fasta_text = '\n'.join(records)

    with open(fasta_path, 'w') as fp:
        fp.write(fasta_text)

    os.remove(pickled_file)

    return reassigned
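# A small, self-contained illustration of the hash-based lookup above (the
# sequence and identifiers are invented): the Chewie-NS reports identifiers
# keyed by the SHA-256 digest of each sequence, so the local record identifier
# is replaced by the one attributed by the server.
import hashlib

example_seq = 'ATGCATGCATGA'
example_hash = hashlib.sha256(example_seq.encode('utf-8')).hexdigest()
example_attributed = {example_hash: '7'}

example_recid = '*3'
example_recid = example_attributed.get(example_hash, example_recid)
# example_recid == '7' because an identifier was attributed to this sequence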
Example 6
def upload_alleles_data(alleles_data, length_files, base_url, headers_post,
                        headers_post_bytes, species_id, schema_id):
    """ Uploads files with the data to insert alleles and the
        length values for the sequences of each locus.

        Parameters
        ----------
        alleles_data : list
            List of tuples, one per locus, each containing the path
            to the ZIP archive with the data to insert alleles, the
            identifier of the locus and the name of the locus.
        length_files : list
            List with paths to the pickled files that contain a
            dictionary with sequences hashes as keys and sequence
            length as values.
        base_url : str
            Base URL of the Nomenclature server.
        headers_post : dict
            HTTP headers for POST requests that accept JSON
            formatted data.
        headers_post_bytes : dict
            HTTP headers for POST requests that support file
            upload.
        species_id : int
            The identifier of the schema's species in the NS.
        schema_id : int
            The identifier of the schema in the NS.

        Returns
        -------
        failed : list of str
            List with the identifiers of the loci whose alleles
            data could not be fully uploaded.
        zip_res : dict
            Dictionary with the response returned by the last
            POST request. It has loci identifiers as keys and
            lists with two dictionaries as values. Both dictionaries
            map sequence hashes to sequence identifiers in the
            Chewie-NS: the first contains the hashes of the sequences
            that were sent but were already present in the locus
            (repeated alleles), and the second contains the hashes of
            the sequences that were accepted and inserted into the
            locus.
    """

    uploaded = 0
    failed = []
    for i, a in enumerate(alleles_data):

        locus_id = a[1]

        # get length of alleles from current locus
        current_len = length_files[i]
        data = fo.pickle_loader(current_len)
        data = {locus_id: data[next(iter(data))]}
        data = {'content': data}

        # send data to the NS
        send_url = cr.make_url(base_url, 'species', species_id, 'schemas',
                               schema_id, 'loci', locus_id, 'lengths')

        lengths_res = cr.simple_post_request(send_url,
                                             headers_post,
                                             data=json.dumps(data))[1]
        length_status = lengths_res.status_code

        # get path to ZIP archive with data to insert alleles
        current_zip = a[0]

        # send data to insert alleles in the NS
        zip_url = cr.make_url(base_url, 'species', species_id, 'schemas',
                              schema_id, 'loci', locus_id, 'update')

        if alleles_data[i] == alleles_data[-1]:
            headers_post_bytes['complete'] = 'True'

        zip_res = cr.upload_file(current_zip, os.path.basename(current_zip),
                                 zip_url, headers_post_bytes, False)

        zip_status = zip_res.status_code

        # determine if upload was successful
        if length_status not in [200, 201] or zip_status not in [200, 201]:
            failed.append(locus_id)
        else:
            uploaded += 1
            print('\r',
                  '    Sent data for alleles of '
                  '{0}/{1} loci.'.format(uploaded, len(alleles_data)),
                  end='')

    return [failed, zip_res]
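# Sketch of the lengths payload sent for each locus (the hashes and lengths are
# invented): the pickled file maps sequence hashes to sequence lengths, and the
# request body nests that mapping under the locus identifier and a 'content' key.
import json

example_lengths = {'sha256-of-allele-1': 1245, 'sha256-of-allele-2': 1248}
example_payload = {'content': {'locus1': example_lengths}}
example_body = json.dumps(example_payload)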