Ejemplo n.º 1
0
def collect_proteomes_and_annotaitons(input_dir):
    """Split the regular files of ``input_dir`` into FASTA and GenBank files.

    A file is pre-selected by its extension and then confirmed by letting
    ``SeqIO`` (presumably Biopython's ``Bio.SeqIO`` -- imported elsewhere in
    the file) parse it. Files that fail the probe are skipped with a debug
    log entry. The scan is not recursive: sub-directories are ignored.

    Args:
        input_dir: Path of the directory to scan.

    Returns:
        A ``(proteomes, annotations)`` tuple of two lists holding the
        joined paths (``input_dir`` + file name) of the accepted FASTA and
        GenBank files respectively.
    """
    fasta_exts = ('.fasta', '.faa', '.fa', '.fsa')
    genbank_exts = ('.gb', '.genbank', '.gbk')

    proteomes = []
    annotations = []

    files = listdir(input_dir)
    if not files:
        # `interrupt` is defined elsewhere; presumably it aborts the run.
        interrupt('Directory contains no files.')

    for name in files:
        path = join(input_dir, name)
        if not isfile(path):
            continue

        # splitext() already yields '' for names without a dot, so no
        # separate "'.' in path" test is needed.
        ext = splitext(path)[1]

        if ext in fasta_exts:
            log.debug('   Checking if %s is fasta.' % path)
            try:
                # Use a default so that a file yielding no records is
                # reported as "not fasta" instead of leaking StopIteration
                # out of this function and aborting the whole scan.
                first_record = next(SeqIO.parse(path, 'fasta'), None)
            except ValueError as e:
                log.debug(str(e) + ', ' + path)
            else:
                if first_record is not None:
                    proteomes.append(path)

        elif ext in genbank_exts:
            log.debug('   Checking if %s is genbank.' % path)
            try:
                SeqIO.read(path, 'genbank')
            except Exception as e:
                # Best-effort probe: any parser failure just means the
                # file is not usable as an annotation.
                log.debug(str(e) + ', ' + path)
            else:
                annotations.append(path)

    # Callers unpack the result as `proteomes, annotations = ...`.
    return proteomes, annotations
Ejemplo n.º 2
0
def collect_proteomes_and_annotaitons(input_dir):
    """Split the regular files of ``input_dir`` into FASTA and GenBank files.

    A file is pre-selected by its extension and then confirmed by letting
    ``SeqIO`` (presumably Biopython's ``Bio.SeqIO`` -- imported elsewhere in
    the file) parse it. Files that fail the probe are skipped with a debug
    log entry. The scan is not recursive: sub-directories are ignored.

    Args:
        input_dir: Path of the directory to scan.

    Returns:
        A ``(proteomes, annotations)`` tuple of two lists holding the
        joined paths (``input_dir`` + file name) of the accepted FASTA and
        GenBank files respectively.
    """
    fasta_exts = ('.fasta', '.faa', '.fa', '.fsa')
    genbank_exts = ('.gb', '.genbank', '.gbk')

    proteomes = []
    annotations = []

    files = listdir(input_dir)
    if not files:
        # `interrupt` is defined elsewhere; presumably it aborts the run.
        interrupt('Directory contains no files.')

    for name in files:
        path = join(input_dir, name)
        if not isfile(path):
            continue

        # splitext() already yields '' for names without a dot, so no
        # separate "'.' in path" test is needed.
        ext = splitext(path)[1]

        if ext in fasta_exts:
            log.debug('   Checking if %s is fasta.' % path)
            try:
                # Use a default so that a file yielding no records is
                # reported as "not fasta" instead of leaking StopIteration
                # out of this function and aborting the whole scan.
                first_record = next(SeqIO.parse(path, 'fasta'), None)
            except ValueError as e:
                log.debug(str(e) + ', ' + path)
            else:
                if first_record is not None:
                    proteomes.append(path)

        elif ext in genbank_exts:
            log.debug('   Checking if %s is genbank.' % path)
            try:
                SeqIO.read(path, 'genbank')
            except Exception as e:
                # Best-effort probe: any parser failure just means the
                # file is not usable as an annotation.
                log.debug(str(e) + ', ' + path)
            else:
                annotations.append(path)

    # Callers unpack the result as `proteomes, annotations = ...`.
    return proteomes, annotations
Ejemplo n.º 3
0
    def run(starting_from_here=False):
        """Populate the proteomes directory from the configured input source.

        The sources are tried in this order; the first one that is set wins:

        1. ``p.species_list`` -- fetch annotations by species name.
        2. ``p.ids_list``     -- fetch annotations by reference id.
        3. ``p.proteomes`` / ``p.annotations`` -- use local directories.

        ``p`` and ``config`` come from the enclosing scope, which is not
        visible here. ``starting_from_here`` is accepted but not read by
        this body.

        Returns:
            ``4`` when the connectivity test fails in the species/ids
            branches; a fetch status that signals failure; otherwise the
            result of ``make_proteomes`` or ``adjust_proteomes``. ``None``
            when the local scan leaves both file lists empty.
        """
        # --- 1. Species names -------------------------------------------
        if p.species_list:
            if not test_entrez_conn():
                log.error('   No internet connection: cannot fetch annotations.')
                return 4

            log.debug('   Using species list: ' + str(p.species_list))
            species_names = read_list(p.species_list)
            log.debug('species_list: ' + str(species_names))
            status = fetch_annotations_species_name_entrez(
                config.annotations_dir, species_names, p.proxy)
            if status != 0:
                return status
            return make_proteomes(config.annotations_dir, config.proteomes_dir)

        # --- 2. Reference ids -------------------------------------------
        if p.ids_list:
            if not test_entrez_conn():
                log.error('No internet connection: cannot fetch annotations.')
                return 4

            log.debug('   Using ref ids: ' + str(p.ids_list))
            wanted_ids = read_list(p.ids_list)
            status = fetch_annotations_for_ids(config.annotations_dir, wanted_ids)
            if status != 0:
                return status
            return make_proteomes(config.annotations_dir, config.proteomes_dir)

        # --- 3. Local directories ---------------------------------------
        proteomes = []
        annotations = []

        if p.proteomes:
            proteomes, annotations = \
                collect_proteomes_and_annotaitons(p.proteomes)
            if not proteomes:
                interrupt('No fasta found in ' + p.proteomes)

        if p.annotations:
            # Both lists are rebound here, replacing the previous scan.
            proteomes, annotations = \
                collect_proteomes_and_annotaitons(p.annotations)
            if not annotations:
                interrupt('No gb files found in ' + p.annotations)

        # NOTE: a stricter check (abort when neither kind of file is found,
        # warn when both are) exists only as disabled code in the original
        # and stays disabled here.

        # GenBank annotations take precedence over plain FASTA files.
        if annotations:
            if not isdir(config.annotations_dir):
                mkdir(config.annotations_dir)
            for gb_path in annotations:
                copy(gb_path, config.annotations_dir)
            return make_proteomes(config.annotations_dir, config.proteomes_dir)

        if not proteomes:
            # Nothing to work with: same implicit result as falling off
            # the end of the function.
            return None

        if not isdir(config.proteomes_dir):
            mkdir(config.proteomes_dir)

        if p.download_anno:
            if test_entrez_conn():
                # Each proteome's base file name (without extension) is
                # used as the id to fetch.
                fetch_ids = [splitext(basename(fasta_path))[0]
                             for fasta_path in proteomes]
                log.debug('ids_list: ' + str(fetch_ids))
                status = fetch_annotations_for_ids(
                    config.annotations_dir, fetch_ids, p.proxy)
                if status > 0:
                    return status
                if status == -1:
                    p.download_anno = False
            else:
                # Being offline is not fatal here: carry on without
                # annotations.
                log.error('   Warning: no internet connection, cannot fetch annotations. '
                          'A reduced version of orthogroups.txt with no annotations will be produced.')

        return adjust_proteomes(proteomes, config.proteomes_dir, p.prot_id_field)
Ejemplo n.º 4
0
    def run(starting_from_here=False):
        """Populate the proteomes directory from the configured input source.

        The sources are tried in this order; the first one that is set wins:

        1. ``p.species_list`` -- fetch annotations by species name.
        2. ``p.ids_list``     -- fetch annotations by reference id.
        3. ``p.proteomes`` / ``p.annotations`` -- use local directories.

        ``p`` and ``config`` come from the enclosing scope, which is not
        visible here. ``starting_from_here`` is accepted but not read by
        this body.

        Returns:
            ``4`` when the connectivity test fails in the species/ids
            branches; a fetch status that signals failure; otherwise the
            result of ``make_proteomes`` or ``adjust_proteomes``. ``None``
            when the local scan leaves both file lists empty.
        """
        # --- 1. Species names -------------------------------------------
        if p.species_list:
            if not test_entrez_conn():
                log.error('   No internet connection: cannot fetch annotations.')
                return 4

            log.debug('   Using species list: ' + str(p.species_list))
            species_names = read_list(p.species_list)
            log.debug('species_list: ' + str(species_names))
            status = fetch_annotations_species_name_entrez(
                config.annotations_dir, species_names, p.proxy)
            if status != 0:
                return status
            return make_proteomes(config.annotations_dir, config.proteomes_dir)

        # --- 2. Reference ids -------------------------------------------
        if p.ids_list:
            if not test_entrez_conn():
                log.error('No internet connection: cannot fetch annotations.')
                return 4

            log.debug('   Using ref ids: ' + str(p.ids_list))
            wanted_ids = read_list(p.ids_list)
            status = fetch_annotations_for_ids(config.annotations_dir, wanted_ids)
            if status != 0:
                return status
            return make_proteomes(config.annotations_dir, config.proteomes_dir)

        # --- 3. Local directories ---------------------------------------
        proteomes = []
        annotations = []

        if p.proteomes:
            proteomes, annotations = \
                collect_proteomes_and_annotaitons(p.proteomes)
            if not proteomes:
                interrupt('No fasta found in ' + p.proteomes)

        if p.annotations:
            # Both lists are rebound here, replacing the previous scan.
            proteomes, annotations = \
                collect_proteomes_and_annotaitons(p.annotations)
            if not annotations:
                interrupt('No gb files found in ' + p.annotations)

        # NOTE: a stricter check (abort when neither kind of file is found,
        # warn when both are) exists only as disabled code in the original
        # and stays disabled here.

        # GenBank annotations take precedence over plain FASTA files.
        if annotations:
            if not isdir(config.annotations_dir):
                mkdir(config.annotations_dir)
            for gb_path in annotations:
                copy(gb_path, config.annotations_dir)
            return make_proteomes(config.annotations_dir, config.proteomes_dir)

        if not proteomes:
            # Nothing to work with: same implicit result as falling off
            # the end of the function.
            return None

        if not isdir(config.proteomes_dir):
            mkdir(config.proteomes_dir)

        if p.download_anno:
            if test_entrez_conn():
                # Each proteome's base file name (without extension) is
                # used as the id to fetch.
                fetch_ids = [splitext(basename(fasta_path))[0]
                             for fasta_path in proteomes]
                log.debug('ids_list: ' + str(fetch_ids))
                status = fetch_annotations_for_ids(
                    config.annotations_dir, fetch_ids, p.proxy)
                if status > 0:
                    return status
                if status == -1:
                    p.download_anno = False
            else:
                # Being offline is not fatal here: carry on without
                # annotations.
                log.error('   Warning: no internet connection, cannot fetch annotations. '
                          'A reduced version of orthogroups.txt with no annotations will be produced.')

        return adjust_proteomes(proteomes, config.proteomes_dir, p.prot_id_field)