# Script: walk every file in the phylotree HTML directory, pull the rows of the
# first <table> out of each page and collect a tree name per file.
# NOTE(review): this source arrived whitespace-collapsed; the indentation below
# is reconstructed from the syntax — confirm against the original file.
import os
import pandas as pd
from bs4 import BeautifulSoup
from Source.mtDNA.tibet.functions.file_system import get_path

# Input (HTML) and output (XLSX) directories, both rooted at get_path().
path_in = get_path()
path_in += '/Data/phylotree/html/'
path_out = get_path()
path_out += '/Data/phylotree/xlsx/'

# Accumulator: one 'tree_name' entry is appended per processed file below.
phylotrees = {'tree_name': [], 'tree': []}

for curr_file in os.listdir(path_in):
    with open(path_in + curr_file, "r") as f:
        contents = f.read()
    soup = BeautifulSoup(contents, 'lxml')
    # Only the first <table> element of the page is used.
    table = soup.find('table')
    table_rows = table.find_all('tr')
    # Convert every <tr> into a list holding the text of its <td> cells.
    table_rows_list = []
    for tr in table_rows:
        td = tr.find_all('td')
        row = [i.text for i in td]
        table_rows_list.append(row)
    # Keep only rows with more than one distinct cell text; this drops empty
    # rows and rows whose cells all contain the same value.
    table_rows_list = [row for row in table_rows_list if len(set(row)) > 1]
    # The tree name is whatever follows 'subtree ' in the second cell of the
    # first remaining row.
    # NOTE(review): str.find returns -1 when 'subtree' is absent, which would
    # silently yield a slice starting at index 7 — confirm the marker is
    # always present.
    tree_name_index = table_rows_list[0][1].find('subtree') + len(
        'subtree ')
    tree_name = table_rows_list[0][1][tree_name_index:]
    phylotrees['tree_name'].append(tree_name)
    # Skip the first 16 remaining rows (presumably page header material —
    # TODO confirm against the HTML layout).
    table_rows_list = table_rows_list[16:]
    # Replace non-breaking spaces ('\xa0') with regular spaces in each cell.
    # NOTE(review): this statement continues beyond the visible source.
    table_rows_list = [[elem.replace('\xa0', ' ') for elem in table_row]
# Script: set up the alignment input paths and result directories, load the
# Tibet alignment data and region info, and define the class grouping used to
# subset subjects.
# NOTE(review): this source arrived whitespace-collapsed; the line layout below
# is reconstructed from the syntax — confirm against the original file.
from Source.mtDNA.tibet.functions.file_system import get_path
from Source.mtDNA.high_altitude.functions import *
from Source.mtDNA.high_altitude.infrastructure_functions import *

# Flag set to 1 here; its consumer is not visible in this part of the file.
read_tibet_data = 1

# Input directories, all rooted at get_path().
path = get_path()
info_data_path = path + '/Data/alignment/info/'
tibet_data_path = path + '/Data/alignment/tibet/'
world_data_path = path + '/Data/alignment/world/'

# Make sure both result directories exist before anything is written.
# NOTE(review): `os` is not imported explicitly in the visible code; presumably
# it is supplied by one of the wildcard imports above — confirm.
tibet_result_path = path + '/Result/tibet_pair/'
if not os.path.exists(tibet_result_path):
    os.makedirs(tibet_result_path)
world_result_path = path + '/Result/world_pair/'
if not os.path.exists(world_result_path):
    os.makedirs(world_result_path)

# read_data and get_region_info are not defined in the visible code
# (presumably they come from the wildcard imports — confirm).
tibet_data, tibet_subjects, tibet_classes = read_data(tibet_data_path)
regions = get_region_info(info_data_path)

# Maps each of the two group labels to the list of class labels it contains.
# NOTE(review): the class labels look like altitude ranges in metres — confirm.
current_tibet_classes = {
    'Asian Low Altitude': [
        '0-500', '501-1000', '1001-1500', '1501-2000', '2001-2500',
        '2501-3000', '3001-4000'
    ],
    'Tibetan High Altitude': ['4001']
}

# NOTE(review): this call continues beyond the visible source.
tibet_subset, tibet_subject_classes = subset_subjects(tibet_data, tibet_classes,
# Script: collect the subject id, group and height label from every FASTA
# header found directly inside the data directory and export the result to
# 'subjects.xlsx' in that same directory.
import os

import pandas as pd

from Source.mtDNA.tibet.functions.file_system import get_path

# Column-oriented accumulator; the three lists always grow together.
subject_info = {'subject': [], 'group': [], 'height': []}
data_path = get_path() + '/Data/'

for filename in os.listdir(data_path):
    if not filename.endswith('.fasta'):
        continue
    # The file name without its '.fasta' suffix is stored as the height label.
    height = filename[:-len('.fasta')]
    # `with` closes the handle even when a header below fails to parse.
    with open(data_path + filename, 'r') as f:
        for line in f:
            # Select header lines by their '>' marker rather than by position,
            # so sequences wrapped over several lines are not mistaken for
            # headers. For strict two-line records the result is unchanged.
            if not line.startswith('>'):
                continue
            line_list = line.rstrip().split(' ')
            # The group is the token right before 'mitochondrion,'. A header
            # without that token raises ValueError, as it did before.
            group_index = line_list.index('mitochondrion,') - 1
            # Append only once the whole header parsed, so the three lists
            # never end up with different lengths.
            subject_info['subject'].append(line_list[0][1:])  # drop leading '>'
            subject_info['group'].append(line_list[group_index])
            subject_info['height'].append(height)

df = pd.DataFrame(subject_info)
# ExcelWriter.save() was removed in pandas 2.0; the context manager writes and
# closes the workbook on exit and works on older pandas versions as well.
with pd.ExcelWriter(data_path + 'subjects.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, index=False, startrow=0)