def add_data_source(self, data_type, data_file, mapping_f=None):

        # for now, only fasta files are handled as data_file, no dirs or zips
        # with data and corresponding mapping files.

        # for now, it is also required that there is a data item for each
        # object for which we have an id.

        # read sequences from fasta file
        try:
            seqs = [s for s in file_io.read_fasta(data_file)]
            ids = [s[0] for s in seqs]
        except Exception as e:
            # TODO
            print('\n%s\n%s\n%s\n' % (e, type(e), e.args))
            return 'Error in fasta file.'

        # close the temporary file, not sure if this is necessary
        data_file.close()

        if len(set(ids)) != len(ids):
            return 'Fasta file contains duplicate ids.'

        fe = self.get_feature_extraction()

        if set(fe.fm_protein.object_ids) != set(ids):
            return 'Ids in provided file do not correspond to ids in project.'

        # reorder the sequences to match the project object ids
        seq_dict = dict(seqs)
        seqs = [(sid, seq_dict[sid]) for sid in fe.fm_protein.object_ids]

        # create a uni_orf_mapping???

        try:
            fe.protein_data_set.set_data_source(data_type, seqs)
        except ValueError as e:
            print(traceback.format_exc())
            return str(e)

        # save feature extraction, if it all went well
        fe.save()

        return ''
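    # --- Illustrative sketch, not part of the original class: a minimal,
    # dependency-free demonstration of the duplicate-id check and the
    # reorder-to-project-ids step used in add_data_source above. The ids and
    # sequences are made up for illustration.
    @staticmethod
    def _reorder_to_project_ids_example():
        seqs = [('id2', 'MKV'), ('id1', 'MAL')]  # (id, sequence) tuples
        ids = [s[0] for s in seqs]
        assert len(set(ids)) == len(ids)         # no duplicate ids allowed
        project_ids = ['id1', 'id2']             # order defined by the project
        seq_dict = dict(seqs)
        # result: [('id1', 'MAL'), ('id2', 'MKV')]
        return [(sid, seq_dict[sid]) for sid in project_ids]
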
    def start_example_project(self, project_id, fasta_f, seq_type, labeling_f):
        ''' Start new project without checking input data
        '''

        self.set_project(project_id)

        # check if user already has a dir and create one if needed
        if not os.path.exists(self.user_dir):
            os.mkdir(self.user_dir)

        # check if a project with the same name exists; if so, append a number
        if os.path.exists(self.project_dir):
            index = 0
            while os.path.exists(self.project_dir):
                project_id = '%s_%i' % (self.project_id.split('_')[0], index)
                self.set_project(project_id)
                index += 1

        # read data from file
        try:
            seqs = [s for s in file_io.read_fasta(fasta_f)]
            ids = [s[0] for s in seqs]
        except Exception as e:
            print(e)
            return 'Error in fasta file'

        # create sequence feature extraction object
        fe = featext.FeatureExtraction()

        # set protein data
        try:
            fe.set_protein_ids(ids)
            fe.protein_data_set.set_data_source(seq_type, seqs)
            # translate to prot seq if orf provided
            if seq_type == 'orf_seq':
                ids = [s[0] for s in seqs]
                prot_seqs = [sequtil.translate(s[1]) for s in seqs]
                # chop off translated stop codons at terminus
                prot_seqs = [s[:-1] if s[-1] == '*' else s for s in prot_seqs]
                fe.protein_data_set.set_data_source('prot_seq',
                                                    zip(ids, prot_seqs))
        except ValueError as e:
            print(traceback.format_exc())
            return str(e)
        except Exception:
            print(traceback.format_exc())
            return 'Error during initiation of new project'

        # add to feature matrix
        try:
            labeling_name = os.path.splitext(os.path.basename(labeling_f))[0]
            fe.fm_protein.add_labeling_from_file(labeling_name, labeling_f)
        except ValueError as e:
            return str(e)

        # create data directory for this project (just to be sure, check again)
        if not os.path.exists(self.project_dir):
            os.mkdir(self.project_dir)

            # and create directories to store job status
            os.mkdir(self.job_dir)
            os.mkdir(self.job_waiting_dir)
            os.mkdir(self.job_running_dir)
            os.mkdir(self.job_done_dir)
            os.mkdir(self.job_error_dir)

            # create classification dir
            os.mkdir(self.cl_dir)

        else:
            return 'A project with the same project id already exists'

        # create project details file
        with open(self.project_details_f, 'w') as fout:
            fout.write('project_id\t%s\n' % (self.project_id))
            fout.write('project_init\t%s\n' % (self.timestamp_str()))

        # store feature extraction data
        fe.set_root_dir(self.fe_dir)
        fe.save()

        return ''
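    # --- Illustrative sketch, not part of the original class: the stop-codon
    # chop applied after ORF translation above, shown stand-alone. A protein
    # sequence ending in '*' (a translated stop codon) loses the trailing '*';
    # others pass through unchanged. The sequences are made up.
    @staticmethod
    def _chop_stop_codons_example():
        prot_seqs = ['MAL*', 'MKV']
        # result: ['MAL', 'MKV']
        return [s[:-1] if s[-1] == '*' else s for s in prot_seqs]
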
    def start_new_project(self, project_id, fasta_file, sequence_type,
                          reference_taxon=None):
        '''
        TOCHECK: what type is fasta_file?
        pre: sequence_type is orf_seq or prot_seq
        pre: user_id is set
        '''

        self.set_project(project_id)

        # check if user already has a dir and create one if needed
        if not os.path.exists(self.user_dir):
            os.mkdir(self.user_dir)

        # check if a project with the same name exists
        if os.path.exists(self.project_dir):
            return 'A project with the same project id already exists'

        # obtain reference fasta file
        ref_seqs = []
        if reference_taxon is not None:

            # TODO
            if sequence_type == 'orf_seq':
                return 'Reference set can only be compared to protein ' +\
                       'amino acid sequences, not to ORF sequences.'

            # obtain reference proteome sequences
            ref_f = os.path.join(self.ref_data_dir,
                                 '%i.fsa' % (reference_taxon))
            ref_red_f = os.path.join(self.ref_data_dir,
                                     '%i_reduced.fsa' % (reference_taxon))
            # first check local dir
            if os.path.exists(ref_red_f):
                ref_seqs = [s for s in file_io.read_fasta(ref_red_f)]
            elif os.path.exists(ref_f):
                ref_seqs = [s for s in file_io.read_fasta(ref_f)]
            # otherwise fetch reference data set
            else:
                pass
                '''
                url = 'http://www.uniprot.org/uniref/' +\
                      '?query=uniprot:(organism:%i+' % (reference_taxon) +\
                      'keyword:181)+identity:0.5&format=fasta'
                response = urllib2.urlopen(url)
                try:
                    ref_seqs = [s for s in file_io.read_fasta(response)]
                except Exception:
                    return 'There appears to be an error in the reference ' +\
                           'data fasta file'

                # check if reference data set is not too large
                max_num_seqs = 15000
                if(len(ref_seqs) > max_num_seqs):
                    # randomly select 15000 sequences
                    indices = random.sample(range(len(ref_seqs)), max_num_seqs)
                    ref_seqs = [ref_seqs[i] for i in indices]
                '''

        # estimate reference data set size
        size = len(ref_seqs) * 285  # estimate 285 bytes per seq

        # check file size
        max_size = 5243000  # bytes (5MB)
        while True:
            data = fasta_file.file.read(8192)
            if size > max_size:
                return 'Sequence data exceeds the maximum ' +\
                       'allowed size (5MB)'

            if not data:
                break
            size += len(data)
        if size > max_size:
            return 'Sequence data exceeds the maximum ' +\
                   'allowed size (5MB)'

        # reset to beginning of fasta file
        fasta_file.file.seek(0)

        # read sequences from fasta file (to obtain object ids...)
        try:
            seqs = [s for s in file_io.read_fasta(fasta_file.file)]
            seqs.extend(ref_seqs)
            ids = [s[0] for s in seqs]
        except Exception as e:
            return str(e) +\
                ' Please consult the documentation (<i>file formats</i> ' +\
                'section) to learn more about the FASTA file format.'

        # reset pointer to begin of file
        #fasta_file.file.seek(0)
        # close the temporary file, not sure if this is necessary
        fasta_file.file.close()

        # create sequence feature extraction object to check input
        fe = featext.FeatureExtraction()
        try:
            fe.set_protein_ids(ids)
            fe.protein_data_set.set_data_source(sequence_type, seqs)
            # translate to prot seq if orf provided
            if sequence_type == 'orf_seq':
                ids = [s[0] for s in seqs]
                prot_seqs = [sequtil.translate(s[1]) for s in seqs]
                # chop off translated stop codons at terminus
                prot_seqs = [s[:-1] if s[-1] == '*' else s for s in prot_seqs]
                fe.protein_data_set.set_data_source('prot_seq',
                                                    zip(ids, prot_seqs))
        except ValueError as e:
            print(traceback.format_exc())
            return str(e)
        except Exception:
            print(traceback.format_exc())
            return 'Error during initiation of new project'

        # add labeling in case of added reference set
        if ref_seqs:
            labels = [(s[0], 0) for s in seqs]
            labels.extend([(s[0], 1) for s in ref_seqs])
            class_names = ['dataset', 'taxon%i' % (reference_taxon)]
            fe.fm_protein.add_labeling('reference', dict(labels), class_names)

        # create data directory for this project (just to be sure, check again)
        if not os.path.exists(self.project_dir):
            os.mkdir(self.project_dir)

            # and create directories to store job status
            os.mkdir(self.job_dir)
            os.mkdir(self.job_waiting_dir)
            os.mkdir(self.job_running_dir)
            os.mkdir(self.job_done_dir)
            os.mkdir(self.job_error_dir)

            # create classification dir
            os.mkdir(self.cl_dir)

        else:
            return 'A project with the same project id already exists'

        # create project details file
        with open(self.project_details_f, 'w') as fout:
            fout.write('project_id\t%s\n' % (self.project_id))
            fout.write('project_init\t%s\n' % (self.timestamp_str()))

        # store feature extraction data
        fe.set_root_dir(self.fe_dir)
        fe.save()

        return ''
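    # --- Illustrative sketch, not part of the original class: the streaming
    # size check from start_new_project as a stand-alone helper, assuming f is
    # any readable file object. Reading in 8192-byte chunks keeps memory use
    # constant while counting bytes against the limit.
    @staticmethod
    def _within_size_limit(f, max_size=5243000, initial_size=0):
        size = initial_size
        while True:
            data = f.read(8192)
            if not data:
                break
            size += len(data)
            if size > max_size:
                return False
        return True
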
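    # --- Illustrative sketch, not part of the original class: the reference
    # labeling built above, stand-alone. Because seqs already includes the
    # reference sequences at that point, dict() lets the later (id, 1) entries
    # override the earlier (id, 0) entries for the reference ids. The tuples
    # are made up for illustration.
    @staticmethod
    def _reference_labeling_example():
        seqs = [('q1', 'MAL'), ('q2', 'MKV'), ('r1', 'MSS')]  # incl. reference
        ref_seqs = [('r1', 'MSS')]
        labels = [(s[0], 0) for s in seqs]
        labels.extend([(s[0], 1) for s in ref_seqs])
        return dict(labels)  # {'q1': 0, 'q2': 0, 'r1': 1}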