def get_all_metrics():
    """Collect alignment metrics for every unordered pair of initial proteins.

    For each pair drawn from ``get_list('InitialProtein')`` the name of the
    pair's aligned file is obtained from ``get_pairwise_filename`` and its
    metrics are read with ``get_metrics_from_filename``.

    Returns:
        A ``pandas.DataFrame`` with one row per pair and the columns ``id1``,
        ``id2``, ``length1``, ``length2``, ``identity``, ``gaps`` and
        ``status``.  ``status`` is ``'isoform'`` when identity equals 1 and
        the gap fraction is positive, ``'duplicate'`` when identity equals 1
        and the gap fraction equals 0, and ``'okay'`` for every other row.
    """
    # Column-wise accumulation keeps all six columns present even when there
    # are no pairs at all.
    columns = {
        'id1': [],
        'id2': [],
        'length1': [],
        'length2': [],
        'identity': [],
        'gaps': [],
    }

    proteins = get_list('InitialProtein')
    for first, second in it.combinations(proteins, 2):
        aligned_name = get_pairwise_filename(first, second, aligned=True)
        pair_metrics = get_metrics_from_filename(aligned_name)

        columns['id1'].append(first.id)
        columns['id2'].append(second.id)
        columns['length1'].append(pair_metrics['first_length'])
        columns['length2'].append(pair_metrics['second_length'])
        columns['identity'].append(pair_metrics['percent_identity'])
        columns['gaps'].append(pair_metrics['gap_fraction'])

    frame = pd.DataFrame(columns)

    # Default label first, then overwrite the two special cases; the two
    # masks cannot both be true for the same row.
    frame['status'] = 'okay'
    fully_identical = frame['identity'] == 1
    frame.loc[fully_identical & (frame['gaps'] > 0), 'status'] = 'isoform'
    frame.loc[fully_identical & (frame['gaps'] == 0), 'status'] = 'duplicate'
    return frame
def make_sequence_files():
    """Write one two-record FASTA file per unordered pair of initial proteins.

    Files are written under ``data/pairwise`` using the name returned by
    ``get_pairwise_filename`` for the pair.
    """
    print('Splitting protein sequences into pairwise files...')
    pairwise_directory = os.path.join('data', 'pairwise')

    for pair in it.combinations(get_list('InitialProtein'), 2):
        target_path = os.path.join(pairwise_directory,
                                   get_pairwise_filename(*pair))
        with open(target_path, 'w') as handle:
            SeqIO.write(list(pair), handle, 'fasta')
def process(self, input):
    """Count patents per application year and college, split by patent type.

    If ``data.patent_pair`` does not yet exist in the current working
    directory, it is created by concatenating every file under
    ``input['patent_dir']``; otherwise the existing file is reused.  Each
    line of that file is split on tabs and its fields are named by pairing
    them with ``self.title_list``.

    Args:
        input: Mapping with the keys ``'college_dir'`` (handed to
            ``get_list`` to obtain the college names) and ``'patent_dir'``
            (directory whose files are concatenated).

    Returns:
        A dict with two entries: ``'yy_dict'`` maps the application year and
        the college name, joined by a tab, to ``{'A': count, 'U': count}``;
        ``'college_list'`` is the list returned by ``get_list``.
    """
    college_list = get_list(input['college_dir'])

    if not os.path.exists('data.patent_pair'):
        # Merge all patent files once; later runs reuse the merged file.
        os.system('cat ' + input['patent_dir'] +'/* > data.patent_pair')

    patent_dict = {}
    # The context manager guarantees the file is closed, even when a
    # malformed line raises part-way through.
    with open('data.patent_pair') as patent_file:
        for line in patent_file:
            items = line.strip().split('\t')
            info = dict(zip(self.title_list, items))

            # Publication numbers ending in 'U' are counted as type 'U';
            # everything else is counted as type 'A'.
            patent_type = 'U' if info['公开(公告)号'].endswith('U') else 'A'
            apply_year = info['申请日'][:4]
            apply_person = info['申请人']

            # The applicant field may mention several colleges; the patent
            # is counted once for each college it mentions.
            for college in college_list:
                if college not in apply_person:
                    continue
                key = apply_year + '\t' + college
                counts = patent_dict.setdefault(key, {'A': 0, 'U': 0})
                counts[patent_type] += 1

    return {'yy_dict': patent_dict, 'college_list': college_list}
def single_linkage_clustering(threshold, variable='both'):
    """Cluster the initial proteins by single linkage over pairwise metrics.

    A pair of sequences is linked when it passes the threshold test below;
    sequences connected through any chain of links receive the same label.

    Args:
        threshold: For ``variable='both'``, a mapping with the keys
            ``'identity'`` and ``'gaps'``: a pair is linked when its
            identity is greater than the former and its gap value is less
            than the latter.  For ``variable='identity'``, a single value
            that the pair's identity must exceed.
        variable: Which test to apply, ``'both'`` (default) or
            ``'identity'``.

    Returns:
        A NumPy integer array holding one cluster label per sequence, in the
        order of ``get_list('InitialProtein')``.  Labels are taken from the
        sequence positions, so they are not necessarily consecutive.

    Raises:
        ValueError: If ``variable`` is neither ``'both'`` nor ``'identity'``.
    """
    # Fail early with a clear message instead of an opaque KeyError on the
    # never-created 'cluster_variable' column.
    if variable not in ('both', 'identity'):
        raise ValueError(
            "variable must be 'both' or 'identity', got %r" % (variable,))

    sequences = get_list('InitialProtein')

    # Map each id to the position of its first occurrence (the same answer
    # list.index would give) so every lookup below is O(1).
    position_of = {}
    for position, sequence in enumerate(sequences):
        position_of.setdefault(sequence.id, position)

    metrics = pairwise.get_all_metrics()
    if variable == 'both':
        linked = ((metrics.identity > threshold['identity'])
                  & (metrics.gaps < threshold['gaps']))
    else:
        linked = metrics.identity > threshold
    metrics['cluster_variable'] = linked

    # Every sequence starts in its own cluster, labelled by its position.
    current_assignment = np.arange(len(sequences))
    linked_pairs = zip(metrics.loc[linked, 'id1'], metrics.loc[linked, 'id2'])
    for id1, id2 in linked_pairs:
        label1 = current_assignment[position_of[id1]]
        label2 = current_assignment[position_of[id2]]
        # Merge: relabel every member of the first cluster into the second.
        current_assignment[current_assignment == label1] = label2
    return current_assignment
def cluster_and_align(threshold):
    """Write every multi-sequence cluster to FASTA and align it with MAFFT.

    Clusters come from ``single_linkage_clustering(threshold)``.  For each
    cluster with more than one member, the member sequences are written to
    ``data/clusters/cluster_<n>.fasta`` and the output of ``mafft`` on that
    file is stored in ``data/clusters/cluster_<n>__ALIGNED.fasta``, where
    ``<n>`` counts the written clusters from 0.  Single-member clusters are
    skipped and do not consume a number.

    Args:
        threshold: Passed unchanged to ``single_linkage_clustering``.

    Raises:
        OSError: If the ``mafft`` executable cannot be started.
    """
    directory_path = os.path.join('data', 'clusters')
    clusters = single_linkage_clustering(threshold)
    sequences = get_list('InitialProtein')

    count = 0
    for cluster in set(clusters):
        indices = np.flatnonzero(clusters == cluster)
        if len(indices) < 2:
            # A lone sequence has nothing to be aligned against.
            continue

        unaligned_path = os.path.join(directory_path,
                                      'cluster_%d.fasta' % count)
        cluster_sequences = [sequences[index] for index in indices]
        with open(unaligned_path, 'w') as output_file:
            SeqIO.write(cluster_sequences, output_file, 'fasta')

        aligned_path = os.path.join(directory_path,
                                    'cluster_%d__ALIGNED.fasta' % count)
        # Pass the arguments as a list and redirect stdout ourselves rather
        # than building a shell string, so paths containing spaces or shell
        # metacharacters are handled correctly.  The exit status is ignored.
        with open(aligned_path, 'w') as aligned_file:
            subprocess.call(['mafft', unaligned_path], stdout=aligned_file)

        count += 1
from tools import create_file, create_directory, get_list, delete_file, copy_file, save_info, change_dir
from game import game

# Interactive command loop of the file manager: keep prompting for a command
# until the user types 'exit'.
print('Hello. File manager is ready to work.')
command = None
# NOTE(review): save_info presumably appends the message to a log - confirm
# against tools.save_info.
save_info('Programm started')
while command != 'exit':
    command = input(
        'Enter command or exit to end work. Enter help to list commands: ')
    if command == 'list':
        # 'list': delegate to tools.get_list().
        get_list()
    elif command == '0':
        # '0': ask for a path and hand it to change_dir.
        name = input('Enter new path: ')
        change_dir(name)
    elif command == '1':
        # '1': create a file; an empty name is reported and recorded instead.
        name = input('Enter file name: ')
        if name == '':
            print('File name is missing')
            save_info('Error - File name is missing')
        else:
            create_file(name)
            save_info(f'File {name} is created')
    elif command == '2':
        # '2': ask for a folder name; an empty name is reported to the user.
        name = input('Enter folder name: ')
        if name == '':
            print('Folder name is missing')
def setUp(self):
    """Load the nucleotide and protein fixtures used by the tests.

    Stores the ``'InitialNucleotide'`` data both as the result of
    ``get_list`` and of ``get_dictionary``, and the ``'InitialProtein'``
    data as the result of ``get_dictionary``.
    """
    nucleotide_source = 'InitialNucleotide'
    self.nucleotide_records = get_list(nucleotide_source)
    self.nucleotide_dictionary = get_dictionary(nucleotide_source)
    self.protein_dictionary = get_dictionary('InitialProtein')