def check_format(self):
    """
    Check that the file looks like FASTQ by inspecting its first two records.

    ``self.is_gz`` is refreshed from ``self.is_gzip`` and selects between
    ``gzip.open`` and the built-in ``open``.  Lines 1 and 5 (the header lines
    of the first two four-line records) must both start with ``@``.  When
    both headers contain an underscore, ``self.has_sample_info`` is set to
    True; otherwise the attribute is left untouched.

    :return: True when the check passes
    :raises FileError: if either header does not start with ``@`` or the
        file has fewer than five lines
    """
    gz_flag = self.is_gzip
    # ``is_gzip`` may be exposed as a property or as a plain method; calling
    # it when callable avoids storing an always-truthy bound method.
    if callable(gz_flag):
        gz_flag = gz_flag()
    self.is_gz = gz_flag

    if self.is_gz:
        opener, mode, message = gzip.open, 'rb', "非压缩后的fastq格式文件"
    else:
        opener, mode, message = open, 'r', "fastq文件格式错误"

    with opener(self.prop['path'], mode) as handle:
        try:
            # The next() builtin works on Python 2 and 3 file objects alike.
            first_header = next(handle)
            for _ in range(3):
                next(handle)
            second_header = next(handle)
        except StopIteration:
            # A truncated file is a format error, not an iteration detail.
            raise FileError(message)

    headers = []
    for raw in (first_header, second_header):
        # Binary gzip handles yield bytes on Python 3; on Python 2 bytes is
        # str, so this branch is skipped there.
        if isinstance(raw, bytes) and not isinstance(raw, str):
            raw = raw.decode('utf-8', 'replace')
        headers.append(raw)

    if not all(re.search(r'^@', header) for header in headers):
        raise FileError(message)
    if all(len(re.split('_', header)) > 1 for header in headers):
        self.has_sample_info = True
    return True
def check(self):
    """
    Validate the hdrs file.

    Runs the parent class check first; when that succeeds, the line syntax
    is verified with ``check_line`` and, if a companion FASTA file has been
    attached (``self._has_fa``), ``check_consistence`` is consulted as well.

    :return: True when every check passes; None when the parent check
        returns a false value
    :raises FileError: if the line syntax check fails or the hdrs file and
        its FASTA file disagree
    """
    if not super(HdrsFile, self).check():
        return None
    if not self.check_line():
        raise FileError("文件格式错误")
    if self._has_fa and not self.check_consistence():
        raise FileError("hdrs文件({})和他的fasta文件({})内容上不一致".format(self.path, self.fasta.path))
    return True
def check_fa_index_coherence(self):
    """
    Verify that every single-sequence FASTA file has all of its index files.

    Each entry of the ``self.ref_single_seq_fa`` directory must be named
    ``<something>.fa`` and the ``self.ref_seq_fa_index`` directory must
    contain the six matching ``.ebwt`` files (``.1`` to ``.4`` plus
    ``.rev.1`` and ``.rev.2``).  Once all files pass, the ``mapsplice_map``
    step is added to ``self.step``, started, and the step state is updated.

    :raises RuntimeError: if a file name in the FASTA directory is not a
        whitespace-free ``*.fa`` name
    :raises FileError: if any of the six index files is missing for a FASTA
        file
    """
    index_file_set = set(os.listdir(self.ref_seq_fa_index))
    index_suffixes = ('1.ebwt', '2.ebwt', '3.ebwt', '4.ebwt',
                      'rev.1.ebwt', 'rev.2.ebwt')
    for fa_name in os.listdir(self.ref_single_seq_fa):
        m_fa = re.match(r'^(\S+\.fa)$', fa_name)
        if not m_fa:
            raise RuntimeError(
                'the file name {} in path {} is an illegal single seq fasta file name!'
                .format(fa_name, self.ref_single_seq_fa))
        # The index files are named after the full FASTA file name,
        # including its ".fa" extension.
        seq_name = m_fa.group(1)
        if not all('{}.{}'.format(seq_name, suffix) in index_file_set
                   for suffix in index_suffixes):
            # The placeholders are now actually filled in.
            raise FileError(
                'file {} in the fasta dir {} does not have legal(enough) index files'
                .format(fa_name, self.ref_single_seq_fa))
    self.step.add_steps('mapsplice_map')
    self.step.mapsplice_map.start()
    self.step.update()
def is_gzip(self):
    """
    Tell from the file-name suffix whether the file is gzip-compressed.

    :return: True if ``self.prop['path']`` ends in ``.gz`` or ``.gzip``,
        otherwise False
    :raises FileError: if the path ends in ``.tar.gz``, which is rejected
        as unsupported
    """
    path = self.prop['path']
    # Raw strings keep the patterns unchanged while avoiding the invalid
    # "\." escape sequence that newer Python versions warn about.
    if re.search(r'\.tar\.gz$', path):
        raise FileError("不支持tar.gz格式文件")
    return bool(re.search(r'\.(gz|gzip)$', path))
def _prepare(self, work_path): """ 为获取序列的信息做准备 生成临时文件夹,当输入的文件是gz格式时,解压到tmp里 """ self.unzipfile = self.prop['path'] basename = os.path.basename(self.prop['path']) self.fastaname = os.path.join(work_path, basename + ".fasta") if self.is_gz: basename = re.search(r'(.+)\.gz', basename).group(1) self.unzipfile = os.path.join(work_path, basename) try: subprocess.check_call('gunzip -c ' + self.prop['path'] + "> " + self.unzipfile) except subprocess.CalledProcessError: raise FileError("非标准格式的gz文件!")
def check_line(self):
    """
    Check that the hdrs file starts with a syntactically valid header line.

    Only the first line of ``self.path`` is inspected.  It must look like
    ``>id /len=<int> /nonNlen=<int> /org=<name>``.  Nothing is checked when
    the parent class check returns a false value.

    :return: True when the first line is valid; None when the parent check
        returns a false value
    :raises FileError: if the first line does not match the hdrs syntax
    :raises Exception: if the file cannot be opened or read
    """
    if not super(HdrsFile, self).check():
        return None
    try:
        with open(self.path) as fr:
            line = fr.readline().strip()
    except Exception as e:
        raise Exception("打开hdrs文件运行出错: {}".format(e))
    # Validation happens outside the try block so the FileError reaches the
    # caller instead of being re-wrapped as a generic Exception.
    if not re.match(r">(\S+)\s+/len=(\d+)\s+/nonNlen=(\d+)\s+/org=(\S+)", line):
        raise FileError("hdrs文件({})的这一行:{}不满足要求".format(self.path, line))
    return True
def check_line_format(self):
    '''
    Collect the sequence ids declared by the header lines of the hdrs file.

    Every non-blank line of ``self.path`` must look like
    ``>id /len=<int> /nonNlen=<int> /org=<name>``; the ``id`` part of each
    line is gathered into the returned set.

    :return: set of the ids found in the hdrs file
    :raises FileError: if the file cannot be read, holds no header line, or
        contains a line that does not match the hdrs syntax
    '''
    # NOTE(review): the paths were hard-coded to "" so the open() call could
    # never succeed; they are now taken from the instance, assuming
    # ``self.path`` is the hdrs file - confirm.
    hdrs_abs_path = self.path
    co_fa_path = self.fasta.path if getattr(self, '_has_fa', False) else ""
    id_set = set()
    try:
        with open(hdrs_abs_path) as hdrs:
            for line in hdrs:
                line = line.strip()
                if not line:
                    continue
                m = re.match(r'>(\S+)\s+/len=\d+\s+/nonNlen=\d+\s+/org=\S+', line)
                if m is None:
                    raise ValueError(line)
                id_set.add(m.group(1))
        if not id_set:
            # A file without any header line is still rejected.
            raise ValueError("no header line")
    except (IOError, OSError, ValueError):
        raise FileError("{}的hdrs文件:{}不符合要求".format(co_fa_path, hdrs_abs_path))
    return id_set
def write_hdrs(self):
    '''
    Generate the .hdrs companion file of the FASTA file, for use by ASprofile.

    One line is written to ``self.path`` per sequence of ``self.fasta.path``:
    ">seq_id /len=seq_length /nonNlen=non_N_seq_length /org=org_name",
    where ``org_name`` is the FASTA file's base name up to its first dot and
    the non-N length excludes both "N" and "n" characters.

    :raises FileError: if anything goes wrong while reading the FASTA file
        or writing the hdrs file

    author: linfang.jin
    date: 2017.01.15
    '''
    try:
        org_name = os.path.basename(self.fasta.path).strip().split(".")[0]
        # The context manager closes the output file even when parsing fails.
        with open(self.path, 'w') as fw:
            for seq in SeqIO.parse(self.fasta.path, "fasta"):
                seq_len = len(seq.seq)
                non_n_len = seq_len - seq.seq.count("N") - seq.seq.count("n")
                # "/len=" (with the slash) is the form the hdrs syntax
                # checks in this file require.
                fw.write(">{} /len={} /nonNlen={} /org={}\n".format(
                    seq.id, seq_len, non_n_len, org_name))
    except Exception as e:
        raise FileError("{}:生成{}的hdrs文件({})过程中出错".format(e, self.fasta.path, self.path))
def set_fasta(self, fasta_obj):
    """
    Attach a companion FASTA file object to this file.

    The object is stored under the ``co_fasta_file`` key of
    ``self._properties`` and ``self._has_fa`` is switched on.

    :param fasta_obj: the FastaFile instance to attach
    :return: this object, so calls can be chained
    :raises FileError: if ``fasta_obj`` is not a FastaFile instance
    """
    if isinstance(fasta_obj, FastaFile):
        self._properties['co_fasta_file'] = fasta_obj
        self._has_fa = True
        return self
    raise FileError("传入的文件不是FastaFile对象")