Example #1
0
 def check_format(self):
     """
     Check that the file looks like a FASTQ file.

     Only the 1st and 5th lines (the header lines of the first two
     records) are inspected: both must start with ``@``. When both headers
     also contain an underscore, ``self.has_sample_info`` is set to True.
     ``self.is_gz`` is refreshed from ``self.is_gzip`` as a side effect.

     :return: True when the check passes
     :raises FileError: if a header does not start with ``@`` or the file
         has fewer than five lines
     """
     # NOTE(review): is_gzip is read as an attribute, so it is presumably a
     # property on the class -- confirm.
     self.is_gz = self.is_gzip
     if self.is_gz:
         # Binary mode yields bytes on Python 3, so compare against bytes
         # markers (on Python 2 a bytes literal is a plain str).
         handle = gzip.open(self.prop['path'], 'rb')
         marker, separator = b'@', b'_'
         message = "非压缩后的fastq格式文件"
     else:
         handle = open(self.prop['path'], 'r')
         marker, separator = '@', '_'
         message = "fastq文件格式错误"
     with handle as f:
         try:
             header1 = next(f)
             for _ in range(3):
                 # Skip the sequence, '+' and quality lines of record one.
                 next(f)
             header5 = next(f)
         except StopIteration:
             # A truncated file must surface as FileError, not StopIteration.
             raise FileError(message)
     if not (header1.startswith(marker) and header5.startswith(marker)):
         raise FileError(message)
     if separator in header1 and separator in header5:
         self.has_sample_info = True
     return True
Example #2
0
 def check(self):
     """
     Validate the hdrs file.

     When the parent class check succeeds, the line check is run and, if a
     fasta file has been attached (``self._has_fa``), the consistency check
     against that fasta is run as well. When the parent check returns a
     falsy value, no further checks are made.

     :return: True
     :raises FileError: if the line check or the consistency check fails
     """
     if not super(HdrsFile, self).check():
         return True
     if not self.check_line():
         raise FileError("文件格式错误")
     if self._has_fa and not self.check_consistence():
         raise FileError("hdrs文件({})和他的fasta文件({})内容上不一致".format(self.path, self.fasta.path))
     return True
 def check_fa_index_coherence(self):
     """
     Verify that every single-sequence fasta file has its index files.

     Each entry of ``self.ref_single_seq_fa`` must be named ``<name>.fa``,
     and ``self.ref_seq_fa_index`` must contain ``<name>.fa.1.ebwt`` to
     ``<name>.fa.4.ebwt`` plus ``<name>.fa.rev.1.ebwt`` and
     ``<name>.fa.rev.2.ebwt``. When every file passes, the
     ``mapsplice_map`` step is added, started and the step is updated.

     :raises RuntimeError: if an entry is not a legal ``*.fa`` file name
     :raises FileError: if any index file is missing for a fasta file
     """
     index_file_set = set(os.listdir(self.ref_seq_fa_index))
     index_suffixes = ('1.ebwt', '2.ebwt', '3.ebwt', '4.ebwt',
                       'rev.1.ebwt', 'rev.2.ebwt')
     for fa_name in os.listdir(self.ref_single_seq_fa):
         m_fa = re.match(r'^(\S+\.fa)$', fa_name)
         if not m_fa:
             raise RuntimeError(
                 'the file name {} in path {} is an illegal single seq fasta file name!'
                 .format(fa_name, self.ref_single_seq_fa))
         seq_name = m_fa.group(1)
         for suffix in index_suffixes:
             if '{}.{}'.format(seq_name, suffix) not in index_file_set:
                 # The placeholders were previously never filled in.
                 raise FileError(
                     'file {} in the fasta dir {} does not have legal(enough) index files'
                     .format(fa_name, self.ref_single_seq_fa))
     self.step.add_steps('mapsplice_map')
     self.step.mapsplice_map.start()
     self.step.update()
Example #4
0
 def is_gzip(self):
     """
     Decide from the file suffix whether the file is gzip-compressed.

     :return: True if the path ends in ``.gz`` or ``.gzip``, else False
     :raises FileError: if the path ends in ``.tar.gz`` (not supported)
     """
     path = self.prop['path']
     # Raw strings: '\.' in a plain literal is an invalid escape sequence.
     if re.search(r'\.tar\.gz$', path):
         raise FileError("不支持tar.gz格式文件")
     return bool(re.search(r'\.(?:gz|gzip)$', path))
Example #5
0
 def _prepare(self, work_path):
     """
     为获取序列的信息做准备
     生成临时文件夹,当输入的文件是gz格式时,解压到tmp里
     """
     self.unzipfile = self.prop['path']
     basename = os.path.basename(self.prop['path'])
     self.fastaname = os.path.join(work_path, basename + ".fasta")
     if self.is_gz:
         basename = re.search(r'(.+)\.gz', basename).group(1)
         self.unzipfile = os.path.join(work_path, basename)
         try:
             subprocess.check_call('gunzip -c ' + self.prop['path'] + "> " +
                                   self.unzipfile)
         except subprocess.CalledProcessError:
             raise FileError("非标准格式的gz文件!")
Example #6
0
    def check_line(self):
        """
        Check that the hdrs file follows the hdrs line syntax.

        Only the first line of the file is read and matched against
        ``>id /len=N /nonNlen=N /org=NAME``. The check is skipped when the
        parent class check returns a falsy value.

        :return: True
        :raises FileError: if the first line does not match the syntax
        :raises Exception: if the file cannot be opened or read
        """
        if super(HdrsFile, self).check():
            try:
                with open(self.path) as fr:
                    line = fr.readline().strip()
            except (IOError, OSError) as e:
                raise Exception("打开hdrs文件运行出错: {}".format(e))
            # Validate outside the try block so the FileError reaches the
            # caller instead of being re-wrapped as a generic Exception.
            m = re.match(r">(\S+)\s+/len=(\d+)\s+/nonNlen=(\d+)\s+/org=(\S+)", line)
            if not m:
                raise FileError("hdrs文件({})的这一行:{}不满足要求".format(self.path, line))
        return True
Example #7
0
    def check_line_format(self):
        """
        Collect the sequence id found on the first line of the hdrs file.

        :return: set holding the id captured from the first line
        :raises FileError: if the file cannot be read or the first line
            does not match the hdrs syntax
        """
        # NOTE(review): both paths are hard-coded empty strings, so open()
        # cannot succeed as written -- confirm where they should come from.
        hdrs_abs_path = ""
        co_fa_path = ""
        seq_ids = set()
        try:
            with open(hdrs_abs_path) as handle:
                first_line = handle.readline().strip()
                matched = re.match(r'>(\S+)\s+/len=\d+\s+/nonNlen=\d+\s+/org=\S+', first_line)
                seq_ids.add(matched.group(1))
        except Exception:
            raise FileError("{}的hdrs文件:{}不符合要求".format(co_fa_path, hdrs_abs_path))
        return seq_ids
Example #8
0
    def write_hdrs(self):
        """
        Write the .hdrs file for the attached fasta file (used by ASprofile).

        One line is written to ``self.path`` per sequence in
        ``self.fasta.path``, in the form
        ``>seq_id /len=seq_length /nonNlen=non_N_seq_length  /org=org_name``.
        ``org_name`` is the fasta file's base name up to its first dot, and
        the non-N length excludes both ``N`` and ``n``.

        :return: None
        :raises FileError: if anything fails while parsing the fasta file
            or writing the hdrs file
        author: linfang.jin
        date:2017.01.15
        """
        try:
            org_name = os.path.basename(self.fasta.path).strip().split(".")[0]
            # The context manager closes the handle even if parsing fails.
            with open(self.path, 'w') as fw:
                for seq in SeqIO.parse(self.fasta.path, "fasta"):
                    seq_len = len(seq.seq)
                    non_n_len = seq_len - seq.seq.count("N") - seq.seq.count("n")
                    # '/len=' (with the slash) is the form the hdrs line
                    # check in this file requires.
                    fw.write(">{} /len={} /nonNlen={}  /org={}\n".format(
                        seq.id, seq_len, non_n_len, org_name))
        except Exception as e:
            raise FileError("{}:生成{}的hdrs文件({})过程中出错".format(e, self.fasta.path, self.path))
Example #9
0
 def set_fasta(self, fasta_obj):
     """
     Attach the companion fasta file object to this file.

     Stores the object under ``co_fasta_file`` in ``self._properties`` and
     sets ``self._has_fa`` to True.

     :param fasta_obj: a FastaFile instance
     :return: self, so calls can be chained
     :raises FileError: if ``fasta_obj`` is not a FastaFile instance
     """
     if isinstance(fasta_obj, FastaFile):
         self._properties['co_fasta_file'] = fasta_obj
         self._has_fa = True
         return self
     raise FileError("传入的文件不是FastaFile对象")