def stego_make_mp3stego(wav_files_path, mp3_files_path, bitrate, start_idx=0, end_idx=10000):
    """
    make stego samples by running encode_HCM.exe on every wav file, once per embedding rate
    :param wav_files_path: path of wav audio files
    :param mp3_files_path: path of mp3 audio files, created when it does not exist
    :param bitrate: bitrate, a string that is concatenated into the command line
    :param start_idx: start index of audio files
    :param end_idx: end index of audio files
    :return: NULL
    """
    if not os.path.exists(wav_files_path):
        print("The wav files path does not exist.")
        return

    wav_files_list = get_files_list(file_dir=wav_files_path, start_idx=start_idx, end_idx=end_idx)
    embedding_rates = ["01", "03", "05", "08", "10"]
    if not os.path.exists(mp3_files_path):
        os.mkdir(mp3_files_path)

    for wav_file_path in wav_files_list:
        # the output path does not depend on the embedding rate, so compute it once per wav file
        file_name = get_file_name(wav_file_path)
        mp3_file_name = file_name.replace(".wav", ".mp3")
        mp3_file_path = fullfile(mp3_files_path, mp3_file_name)

        # NOTE(review): every embedding rate writes to the same mp3_file_path, so each run
        # overwrites the previous one -- confirm whether a per-rate output name is wanted
        for embedding_rate in embedding_rates:
            # NOTE(review): "--ER" used to be glued to the embedding file path; it is now passed
            # as a separate "--ER <rate>" option -- confirm against the encode_HCM.exe usage
            command = "encode_HCM.exe -b " + bitrate + " -E " + embedding_file_path + \
                      " --ER " + embedding_rate + " " + wav_file_path + " " + mp3_file_path
            # the command is a shell command line, not a Python expression: run it, do not eval() it
            os.system(command)

    print("stego samples are made completely, bitrate %s, stego algorithm %s." % (bitrate, "HCM"))
def calibration(mp3_files_path, calibration_files_path, bitrate, start_idx=0, end_idx=10000):
    """
    re-encode every mp3 file with encode.exe -> encode.exe -b <bitrate> <source mp3> <calibrated mp3>
    files whose calibrated counterpart already exists are left untouched
    :param mp3_files_path: the mp3 files path
    :param calibration_files_path: the calibrated mp3 files path, created when it does not exist
    :param bitrate: bitrate, a string that is concatenated into the command line
    :param start_idx: start index
    :param end_idx: end index
    :return: NULL
    """
    if not os.path.exists(mp3_files_path):
        print("The mp3 files path does not exist.")
        return

    sources = get_files_list(file_dir=mp3_files_path, start_idx=start_idx, end_idx=end_idx)
    if not os.path.exists(calibration_files_path):
        os.mkdir(calibration_files_path)

    for source in sources:
        target = fullfile(calibration_files_path, get_file_name(source))
        if os.path.exists(target):
            # already calibrated
            continue
        os.system(" ".join(["encode.exe", "-b", bitrate, source, target]))

    print("calibration with bitrate %s are completed." % bitrate)
def stego_make_acs(wav_files_path, mp3_files_path, bitrate, width, height="7", embed=embedding_file_path, embedding_rate="10", start_idx=None, end_idx=None):
    """
    make stego samples (ACS), one mp3 per wav file, skipping mp3 files that already exist
    :param wav_files_path: path of wav audio files
    :param mp3_files_path: path of mp3 audio files, created when it does not exist
    :param bitrate: bitrate
    :param width: width of parity-check matrix
    :param height: height of parity-check matrix, default is "7"
    :param embed: path of embedding file
    :param embedding_rate: embedding rate, default is "10"
    :param start_idx: start index of audio files
    :param end_idx: end index of audio files
    :return: NULL
    """
    if not os.path.exists(wav_files_path):
        print("The wav files path does not exist.")
        return

    encoder = "C:/Users/Charles_CatKing/Desktop/ACS/lame.exe"
    # options that are the same for every file
    fixed_options = "-region 2 -layerii 1 -threshold 2 -key 123456"

    sources = get_files_list(file_dir=wav_files_path, start_idx=start_idx, end_idx=end_idx)
    if not os.path.exists(mp3_files_path):
        os.mkdir(mp3_files_path)

    for source in sources:
        target = fullfile(mp3_files_path, get_file_name(source).replace(".wav", ".mp3"))
        if os.path.exists(target):
            continue
        arguments = [encoder, "-b", bitrate, "-embed", embed, "-width", width, "-height", height,
                     "-er", embedding_rate, fixed_options, source, target]
        os.system(" ".join(arguments))
def cover_make_mp3stego(wav_files_path, mp3_files_path, bitrate, start_idx=0, end_idx=10000):
    """
    encode wav files into mp3 cover samples with encode_MP3Stego.exe, skipping existing mp3 files
    :param wav_files_path: path of wav audio files
    :param mp3_files_path: path of mp3 audio files, created when it does not exist
    :param bitrate: bitrate (128, 192, 256, 320), a string that is concatenated into the command line
    :param start_idx: the start index of audio files to be processed
    :param end_idx: the end index of audio files to be processed
    :return: NULL
    """
    if not os.path.exists(wav_files_path):
        print("The wav files path does not exist.")
        return

    sources = get_files_list(file_dir=wav_files_path, start_idx=start_idx, end_idx=end_idx)
    if not os.path.exists(mp3_files_path):
        os.mkdir(mp3_files_path)

    for source in sources:
        target = fullfile(mp3_files_path, get_file_name(source).replace(".wav", ".mp3"))
        if os.path.exists(target):
            continue
        os.system(" ".join(["encode_MP3Stego.exe", "-b", bitrate, source, target]))

    print("MP3Stego cover samples with bitrate %s are completed." % bitrate)
def stego_make_hcm(wav_files_path, mp3_files_path, bitrate, cost="2", embed=embedding_file_path, frame_num="50", embedding_rate="10", start_idx=None, end_idx=None):
    """
    make stego samples (HCM) with encode_HCM.exe, skipping mp3 files that already exist
    :param wav_files_path: path of wav audio files
    :param mp3_files_path: path of mp3 audio files, created when it does not exist
    :param bitrate: bitrate
    :param cost: type of cost function, default is "2"
    :param embed: path of embedding file
    :param frame_num: frame number of embedding message, default is "50"
    :param embedding_rate: embedding rate, default is "10"
    :param start_idx: start index of audio files
    :param end_idx: end index of audio files
    :return: NULL
    """
    if not os.path.exists(wav_files_path):
        print("The wav files path does not exist.")
        return

    sources = get_files_list(file_dir=wav_files_path, start_idx=start_idx, end_idx=end_idx)
    if not os.path.exists(mp3_files_path):
        os.mkdir(mp3_files_path)

    for source in sources:
        target = fullfile(mp3_files_path, get_file_name(source).replace(".wav", ".mp3"))
        if os.path.exists(target):
            continue
        arguments = ["encode_HCM.exe", "-b", bitrate, "-embed", embed, "-cost", cost,
                     "-er", embedding_rate, "-framenumber", frame_num, source, target]
        os.system(" ".join(arguments))
def stego_make_eecs(wav_files_path, mp3_files_path, bitrate, width, height="7", embed=embedding_file_path, frame_num="50", embedding_rate="10", start_idx=None, end_idx=None):
    """
    make stego samples (EECS) with encode_EECS.exe, skipping mp3 files that already exist
    :param wav_files_path: path of wav audio files
    :param mp3_files_path: path of mp3 audio files, created when it does not exist
    :param bitrate: bitrate
    :param width: width of parity-check matrix
    :param height: height of parity-check matrix, default is "7"
    :param embed: path of embedding file
    :param frame_num: frame number of embedding message, default is "50"
    :param embedding_rate: embedding rate, default is "10"
    :param start_idx: start index of audio files
    :param end_idx: end index of audio files
    :return: NULL
    """
    if not os.path.exists(wav_files_path):
        print("The wav files path does not exist.")
        return

    sources = get_files_list(file_dir=wav_files_path, start_idx=start_idx, end_idx=end_idx)
    if not os.path.exists(mp3_files_path):
        os.mkdir(mp3_files_path)

    for source in sources:
        target = fullfile(mp3_files_path, get_file_name(source).replace(".wav", ".mp3"))
        if os.path.exists(target):
            continue
        arguments = ["encode_EECS.exe", "-b", bitrate, "-embed", embed, "-width", width,
                     "-height", height, "-er", embedding_rate, "-framenumber", frame_num,
                     source, target]
        os.system(" ".join(arguments))
def get_test_files(dir):
    """Return the entries of utils.get_files_list(dir) whose names end with test_expansion."""
    return [path for path in utils.get_files_list(dir) if path.endswith(test_expansion)]
def stego_make_mp3stego(wav_files_path, mp3_files_path, bitrate, embedding_rate="10", start_idx=None, end_idx=None):
    """
    make stego samples via MP3Stego (encode_MP3Stego.exe), skipping mp3 files that already exist

    the embedding rate selects the length of the secret message handed to message_random:
        embedding rate      message length
            "1"                 3 Bytes
            "3"                 8 Bytes
            "5"                 13 Bytes
            "8"                 20 Bytes
            "10"                24 Bytes
    any other embedding rate raises ValueError

    :param wav_files_path: path of wav audio files
    :param mp3_files_path: path of mp3 audio files, created when it does not exist
    :param bitrate: bitrate (128, 192, 256, 320)
    :param embedding_rate: embedding rate, default is "10"
    :param start_idx: the start index of audio files to be processed
    :param end_idx: the end index of audio files to be processed
    :return: NULL
    """
    embedding_rates = ["1", "3", "5", "8", "10"]
    message_lengths = [3, 8, 13, 20, 24]

    if not os.path.exists(wav_files_path):
        print("The wav files path does not exist.")
        return

    sources = get_files_list(file_dir=wav_files_path, file_type="wav", start_idx=start_idx, end_idx=end_idx)
    if not os.path.exists(mp3_files_path):
        os.mkdir(mp3_files_path)

    # one message file is prepared up front and shared by all audio files
    rate_position = embedding_rates.index(embedding_rate)
    embedding_file = message_random(embedding_file_path, message_lengths[rate_position])

    for source in sources:
        target = fullfile(mp3_files_path, get_file_name(source).replace(".wav", ".mp3"))
        if os.path.exists(target):
            continue
        arguments = ["encode_MP3Stego.exe", "-b", bitrate, "-E", embedding_file, "-P", "pass",
                     source, target]
        os.system(" ".join(arguments))

    print("stego samples are made completely, bitrate %s, stego algorithm %s." % (bitrate, "MP3Stego"))
def parse_gold_annotation(self, paths):
    """
    Collect the gold interaction annotation of every ``pair`` element and pickle it.

    The collected mapping has the shape
    ``{document id: {sentence id: {e1: {e2: {"ddi": ..., "type": ...}}}}}``
    where ``type`` is None for pairs without a ``type`` attribute. It is written
    to ``models/test_data.p``.

    The entity attributes of each sentence are no longer read: the original code
    stored them in a local dict that was never used.

    :param paths: handed to utils.get_files_list to obtain the XML files to parse
    :return: None
    """
    interaction_collection = {}

    for file_path in utils.get_files_list(paths):
        # the context manager closes the document even when parsing fails
        with open(file_path, "r") as document_data:
            xml_data = parse(document_data)
            document_elt = xml_data.getElementsByTagName("document")
            document_attrs = dict(document_elt[0].attributes.items())
            document_id = document_attrs["id"]

            for sentence in xml_data.getElementsByTagName("sentence"):
                sentence_attrs = dict(sentence.attributes.items())

                for pair in sentence.getElementsByTagName("pair"):
                    pair_attrs = dict(pair.attributes.items())
                    # setdefault creates the missing nesting levels on demand
                    sentence_pairs = interaction_collection.setdefault(
                        document_id, {}).setdefault(sentence_attrs["id"], {})
                    first_entity_pairs = sentence_pairs.setdefault(pair_attrs["e1"], {})
                    first_entity_pairs[pair_attrs["e2"]] = {
                        "ddi": pair_attrs["ddi"],
                        # .get() replaces dict.has_key(), which does not exist in Python 3
                        "type": pair_attrs.get("type"),
                    }

    with open("models/test_data.p", "wb") as output:
        pickle.dump(interaction_collection, output)
def get_executables(dir):
    """
    Return the first non-empty entry of utils.get_files_list(dir) that ends with
    rdo_ex_substr, or None when no entry matches.
    """
    match = None
    for path in utils.get_files_list(dir):
        if path.endswith(rdo_ex_substr):
            match = path
            if match:
                return match
    return match
def stego_make_acs(wav_files_path, mp3_files_path, bitrate, width, height="7", embed=embedding_file_path, embedding_rate="10", frame_embedding_rate="10", region="2", threshold="2", start_idx=None, end_idx=None):
    """
    make stego samples (ACS) with encode_ACS.exe, skipping mp3 files that already exist
    for every encoded file a secret file is obtained from message_random(embed) and a random
    7-digit key is drawn
    :param wav_files_path: path of wav audio files
    :param mp3_files_path: path of mp3 audio files, created when it does not exist
    :param bitrate: bitrate
    :param width: width of parity-check matrix
    :param height: height of parity-check matrix, default is "7"
    :param embed: path of embedding file
    :param embedding_rate: embedding rate, default is "10"
    :param frame_embedding_rate: embedding rate in a frame, default is "10"
    :param region: embedding region, default is "2",
                   "0": Big-Value region, "1": Count1 region, "2": All regions
    :param threshold: threshold value for embedding, embedded coefficients are in
                      [-threshold, threshold], default is "2"
    :param start_idx: start index of audio files
    :param end_idx: end index of audio files
    :return: NULL
    """
    if not os.path.exists(wav_files_path):
        print("The wav files path does not exist.")
        return

    wav_files_list = get_files_list(file_dir=wav_files_path, file_type="wav", start_idx=start_idx, end_idx=end_idx)
    if not os.path.exists(mp3_files_path):
        os.mkdir(mp3_files_path)

    for wav_file_path in wav_files_list:
        file_name = get_file_name(wav_file_path)
        mp3_file_name = file_name.replace(".wav", ".mp3")
        mp3_file_path = fullfile(mp3_files_path, mp3_file_name)
        if os.path.exists(mp3_file_path):
            continue

        temp_secret_file_path = message_random(embed)
        key = random.randint(1000000, 9999999)
        # the key is an int: convert it before concatenation, and separate it from the wav
        # path with a space so that the two stay distinct arguments
        command = "encode_ACS.exe -b " + bitrate + " -embed " + temp_secret_file_path + \
                  " -width " + width + " -height " + height + \
                  " -er " + embedding_rate + " -fer " + frame_embedding_rate + \
                  " -region " + region + " -threshold " + threshold + \
                  " -key " + str(key) + " " + wav_file_path + " " + mp3_file_path
        os.system(command)
def stego_make_mp3stego(wav_files_path, mp3_files_path, bitrate, embedding_rate="10", start_idx=0, end_idx=10000):
    """
    make stego samples via MP3Stego (encode_MP3Stego.exe), skipping mp3 files that already exist

    the secret message is read from a prepared text file in embedding_files_mp3stego_path:
    "stego_0<rate>.txt" when the embedding rate is a single character, "stego_<rate>.txt" otherwise

    :param wav_files_path: path of wav audio files
    :param mp3_files_path: path of mp3 audio files, created when it does not exist
    :param bitrate: bitrate (128, 192, 256, 320)
    :param embedding_rate: embedding rate, default is "10"
    :param start_idx: the start index of audio files to be processed
    :param end_idx: the end index of audio files to be processed
    :return: NULL
    """
    if not os.path.exists(wav_files_path):
        print("The wav files path does not exist.")
        return

    sources = get_files_list(file_dir=wav_files_path, start_idx=start_idx, end_idx=end_idx)
    if not os.path.exists(mp3_files_path):
        os.mkdir(mp3_files_path)

    # single-character rates are zero-padded in the file name
    prefix = "stego_0" if len(embedding_rate) == 1 else "stego_"
    embedding_file = fullfile(embedding_files_mp3stego_path, prefix + embedding_rate + ".txt")

    for source in sources:
        target = fullfile(mp3_files_path, get_file_name(source).replace(".wav", ".mp3"))
        if os.path.exists(target):
            continue
        arguments = ["encode_MP3Stego.exe", "-b", bitrate, "-E", embedding_file, "-P", "pass",
                     source, target]
        os.system(" ".join(arguments))

    print("stego samples are made completely, bitrate %s, stego algorithm %s." % (bitrate, "MP3Stego"))
def delete_model_data(model_path):
    """
    Delete every regular file in the directory of *model_path* except files whose
    path ends with test_expansion or project_expansion, reporting each deleted
    path through utils.enc_print.
    """
    model_dir = os.path.dirname(model_path)
    # the listing is requested as before; its result is not used
    utils.get_files_list(model_dir)
    utils.enc_print('\nFound and deleted model data:')

    deleted = 0
    for entry in os.listdir(model_dir):
        candidate = os.path.join(model_dir, entry)
        if not os.path.isfile(candidate):
            continue
        if candidate.endswith(test_expansion) or candidate.endswith(project_expansion):
            # test and project files are kept
            continue
        utils.enc_print(candidate)
        os.remove(candidate)
        deleted += 1

    if not deleted:
        utils.enc_print('nothing deleted')
    utils.enc_print('\n')
def stego_make_ahcm(wav_files_path, mp3_files_path, bitrate, width, height="7", embed=embedding_file_path, embedding_rate="10", start_idx=None, end_idx=None):
    """
    make stego samples (AHCM) with encode_AHCM.exe, skipping mp3 files that already exist
    for every encoded file a secret file is obtained from message_random(embed) and a random
    7-digit key is drawn
    :param wav_files_path: path of wav audio files
    :param mp3_files_path: path of mp3 audio files, created when it does not exist
    :param bitrate: bitrate
    :param width: width of parity-check matrix
    :param height: height of parity-check matrix, default is "7"
    :param embed: path of embedding file
    :param embedding_rate: embedding rate, default is "10"
    :param start_idx: start index of audio files
    :param end_idx: end index of audio files
    :return: NULL
    """
    if not os.path.exists(wav_files_path):
        print("The wav files path does not exist.")
        return

    wav_files_list = get_files_list(file_dir=wav_files_path, file_type="wav", start_idx=start_idx, end_idx=end_idx)
    if not os.path.exists(mp3_files_path):
        os.mkdir(mp3_files_path)

    for wav_file_path in wav_files_list:
        file_name = get_file_name(wav_file_path)
        mp3_file_name = file_name.replace(".wav", ".mp3")
        mp3_file_path = fullfile(mp3_files_path, mp3_file_name)
        if os.path.exists(mp3_file_path):
            continue

        temp_secret_file_path = message_random(embed)
        key = random.randint(1000000, 9999999)
        # the key is an int: convert it before concatenation, and separate it from the wav
        # path with a space so that the two stay distinct arguments
        command = "encode_AHCM.exe -b " + bitrate + " -embed " + temp_secret_file_path + \
                  " -width " + width + " -height " + height + \
                  " -er " + embedding_rate + \
                  " -key " + str(key) + " " + wav_file_path + " " + mp3_file_path
        os.system(command)
def delete_model_data(model_path):
    """
    Remove the regular files that sit next to *model_path*, keeping those whose
    path ends with test_expansion or project_expansion. Every removed path, or
    'nothing deleted' when there was none, is reported through utils.enc_print.
    """
    folder = os.path.dirname(model_path)
    # the listing is requested as before; its result is not used
    utils.get_files_list(folder)
    utils.enc_print('\nFound and deleted model data:')

    removed_count = 0
    for name in os.listdir(folder):
        full_path = os.path.join(folder, name)
        if not os.path.isfile(full_path):
            continue
        if full_path.endswith(test_expansion) or full_path.endswith(project_expansion):
            # test and project files are kept
            continue
        utils.enc_print(full_path)
        os.remove(full_path)
        removed_count += 1

    if removed_count == 0:
        utils.enc_print('nothing deleted')
    utils.enc_print('\n')
def evaluate_heuristics(paths):
    """
    Load the cue words and walk the entities of every sentence in the given XML documents.

    The cue words are the stripped lines of the files "ddi_key_phrase" and
    "ddi_trigger". For each sentence a mapping
    ``{entity id: {"text": lowercased text, "type": lowercased type}}`` is built.

    NOTE(review): nothing is returned or stored yet -- the comparison of cue
    words against the sentence text is still missing, so the function currently
    only has the effect of reading and parsing its inputs.

    :param paths: handed to utils.get_files_list to obtain the XML files to parse
    :return: None
    """
    cue_words = []
    for cue_file in ("ddi_key_phrase", "ddi_trigger"):
        with open(cue_file) as key_data:
            cue_words.extend(line.strip() for line in key_data)

    for file_path in utils.get_files_list(paths):
        # the context manager closes the document even when parsing fails
        with open(file_path, 'r') as document_data:
            xml_data = parse(document_data)
            document_elt = xml_data.getElementsByTagName("document")
            document_attrs = dict(document_elt[0].attributes.items())
            document_id = document_attrs["id"]

            for sentence in xml_data.getElementsByTagName("sentence"):
                sentence_attrs = dict(sentence.attributes.items())
                # kept in its own name: the original reused "text" for the entity text below,
                # which discarded the sentence text
                sentence_text = sentence_attrs["text"]
                # TODO: for word in cue_words: check whether word occurs in sentence_text

                entity_collection = {}
                for entity in sentence.getElementsByTagName("entity"):
                    entity_attrs = dict(entity.attributes.items())
                    entity_collection[entity_attrs["id"]] = {
                        "text": entity_attrs["text"].lower(),
                        "type": entity_attrs["type"].lower(),
                    }
def make_seeding_folder(torrent_name, src_dir, dst_dir):
    """
    Copy the files from *src_dir* that were matched to the torrent's content into *dst_dir*.

    The torrent file *torrent_name* is decoded with bencode, a folder named after
    torrent['info']['name'] is created in *dst_dir*, the candidate files are
    matched by size and then by piece, and every matched file is copied with
    shutil.copy. The duration of each stage is logged through a Profiler.
    """
    with open(torrent_name, 'rb') as fh:
        raw_torrent = fh.read()
    torrent = bencode.decode(raw_torrent)

    tor_name = torrent['info']['name']
    print(tor_name)
    os.makedirs(name=os.path.join(dst_dir, tor_name), exist_ok=True)

    prf = Profiler()
    candidates = get_files_list(src_dir)
    prf.log('get_files_list() delay:')
    candidates = matching_files_by_size(torrent, candidates)
    prf.log('matching_files_by_size() delay:')
    pieces = make_pieces_from_metadata(torrent, candidates)
    prf.log('make_pieces_from_metadata() delay:')

    for piece in pieces:
        piece.find_match()
    prf.log('find_matches() delay:')

    for candidate in candidates:
        if not candidate.is_matched():
            continue
        dst_file = os.path.join(dst_dir, candidate.get_torrent_name())
        src_file = candidate.get_matched_name()
        dst_path = os.path.join(dst_dir, candidate.get_path_from_torrent())
        if not os.path.exists(dst_path):
            os.makedirs(name=dst_path, exist_ok=True)
        shutil.copy(src_file, dst_file)
    prf.log('copy files delay:')
def make_seeding_folder(torrent_name, src_dir, dst_dir):
    """
    Build a seeding folder in *dst_dir* from the files found in *src_dir*.

    Steps: decode the torrent file *torrent_name* with bencode, create the folder
    named torrent['info']['name'] inside *dst_dir*, match the source files by
    size and by piece, then copy each matched file with shutil.copy. A Profiler
    logs how long every step took.
    """
    with open(torrent_name, 'rb') as torrent_file:
        torrent_bytes = torrent_file.read()
    torrent = bencode.decode(torrent_bytes)

    tor_name = torrent['info']['name']
    print(tor_name)
    torrent_folder = os.path.join(dst_dir, tor_name)
    os.makedirs(name=torrent_folder, exist_ok=True)

    prf = Profiler()
    files = get_files_list(src_dir)
    prf.log('get_files_list() delay:')
    files = matching_files_by_size(torrent, files)
    prf.log('matching_files_by_size() delay:')
    pieces = make_pieces_from_metadata(torrent, files)
    prf.log('make_pieces_from_metadata() delay:')

    for piece in pieces:
        piece.find_match()
    prf.log('find_matches() delay:')

    for matched in files:
        if not matched.is_matched():
            continue
        dst_file = os.path.join(dst_dir, matched.get_torrent_name())
        src_file = matched.get_matched_name()
        dst_path = os.path.join(dst_dir, matched.get_path_from_torrent())
        if not os.path.exists(dst_path):
            os.makedirs(name=dst_path, exist_ok=True)
        shutil.copy(src_file, dst_file)
    prf.log('copy files delay:')
def check_and_fix_unformatted_logs(apps):
    """
    Clean up the unformatted HTML logs of every app and store the result under
    ``logs/formatted_logs/<app>/``.

    Each log has its scripts and styles removed; logs of the apps "mchec" and
    "pdr" additionally go through their app-specific table fix. The result is
    written pretty-printed under the same base name as the source file.

    :param apps: iterable of app names
    :return: None
    """
    for app in apps:
        try:
            # makedirs also creates "logs/formatted_logs" when it is missing,
            # where os.mkdir raised FileNotFoundError
            os.makedirs(os.path.join("logs", "formatted_logs", app))
        except FileExistsError:
            print("folder Exists skipping")

        for file in utils.get_files_list(app, formatted=False):
            with open(file, "r") as source:
                body = html.fromstring(source.read())

            body = remove_scripts_and_style(body)
            if app == "mchec":
                body = fix_mchec_error_tables(body)
            elif app == "pdr":
                body = fix_pdr_server_tables_and_NA(body)

            # take the base name for both "\\" and "/" separated paths; splitting on
            # "\\" alone kept the whole path when it used forward slashes
            base_name = file.replace("\\", "/").rsplit("/", 1)[-1]
            new_file_name = os.path.join(os.getcwd(), "logs", "formatted_logs", app, base_name)
            with open(new_file_name, "w+") as target:
                target.write(html.tostring(body, pretty_print=True).decode())
def text_read_all(text_files_dir, height=200, width=576, separator=","):
    """
    load every txt file of a folder into one array held in memory (not recommended)
    :param text_files_dir: the folder of txt files
    :param height: the height of QMDCT matrix
    :param width: the width of QMDCT matrix
    :param separator: separator of each elements in text file
    :return:
        data: QMDCT matrices, ndarray of float32, shape: [files_num, height, width, 1]
    """
    paths = get_files_list(text_files_dir)
    data = np.zeros([len(paths), height, width, 1], dtype=np.float32)

    # slot k of the array holds the content of the k-th listed file
    for position, path in enumerate(paths):
        data[position] = text_read(path, height, width, separator)

    return data
def get_train_corpus_stats(self, paths):
    """
    Parse the training corpus and pickle its statistics into the ``models`` folder.

    Written files: train_unigram_stats.p, train_bigram_stats.p,
    train_trigram_stats.p (results of get_hash_stats on the true/false counts),
    sentence_stats.p (self.sentence_interaction_information) and
    train_corpus_stats.p (positive, negative and total sentence counts).

    :param paths: handed to utils.get_files_list to obtain the corpus files
    :return: None
    """
    def dump(payload, target):
        # the context manager closes (and thereby flushes) the output file
        with open(target, 'wb') as output:
            pickle.dump(payload, output)

    for file in utils.get_files_list(paths):
        self.parse_ddi_corpus(file)

    unigram_stats = self.get_hash_stats(self.interaction_true_word_count,
                                        self.interaction_false_word_count)
    bigram_stats = self.get_hash_stats(self.interaction_true_bigram_count,
                                       self.interaction_false_bigram_count)
    trigram_stats = self.get_hash_stats(self.interaction_true_trigram_count,
                                        self.interaction_false_trigram_count)

    dump(unigram_stats, "models/train_unigram_stats.p")
    dump(bigram_stats, "models/train_bigram_stats.p")
    dump(trigram_stats, "models/train_trigram_stats.p")
    dump(self.sentence_interaction_information, "models/sentence_stats.p")

    corpus_stats = {
        "positive_sentences": self.positive_sentence_count,
        "negative_sentences": self.negative_sentence_count,
        "total_sentences": self.total_sentence_count,
    }
    dump(corpus_stats, "models/train_corpus_stats.p")
################################################# for folder in target_directory.values(): mkdir_if_missing(folder) video_list = {} file_names = get_frame_sequences(source_directory['apex'][:-5], class_folders=view_list) for view in view_list: video_list[view] = list(file_names[view].keys()) # for each view... for view in view_list: # get a list of all files in the relevant view directory (file list) file_list = get_files_list(source_directory[view]) print('Running view:', view) # run through each video... for video in tqdm(video_list[view]): # create a temporary list with only the relevant video frame names video_frame_list = [] for file in file_list: if file == '.DS_Store': continue file_name = re.match(r".+(?=_\d+\.jpg)", file).group() if file_name == video: video_frame_list.append(file) video_frame_list = sorted( video_frame_list, key=lambda x: int(re.search(r'(?<=_)[\d]+', x).group()))
"""
OCR with textract
http://textract.readthedocs.io/en/stable/

Every file listed for the configured source path is run through textract and
the extracted text is written to "[TEXTRACT]_<base name>.txt" in the output path.
"""
import os

from utils import get_files_list, write_file, set_encoding, local_config
import textract

set_encoding()

SOURCE_PATH = local_config('original_files_path')
OUTPUT_PATH = local_config('processed_files_path')

files = get_files_list(SOURCE_PATH)

for file_name in files:
    # splitext splits at the LAST dot, so names with several dots ("scan.v2.pdf")
    # get the right extension and names without a dot no longer raise IndexError
    base_name, extension = os.path.splitext(file_name)

    if extension == '.pdf':
        print("[PDF]File: %s" % file_name)
        # PDFs are explicitly sent through tesseract
        text = textract.process(SOURCE_PATH + file_name, method='tesseract', language='eng')
    elif extension == '.png':
        print("[PNG]File: %s" % file_name)
        text = textract.process(SOURCE_PATH + file_name, language='eng')
    else:
        print("File: %s" % file_name)
        text = textract.process(SOURCE_PATH + file_name)

    # parenthesised form is valid in both Python 2 and Python 3
    print(text)
    write_file(OUTPUT_PATH + '[TEXTRACT]_' + base_name + '.txt', text)
# Preparation stage: two workers (range(1, 3)) are created on prep_queue, each
# with the default and the DB preparation handler registered and db_queue
# registered as their result queue.
db_prep_handler = DBPrepHandler()
for i in range(1, 3):
    prep_worker = Worker(f'Prep Worker{i}', prep_queue)
    prep_worker.register_handler(default_prep_handler)
    prep_worker.register_handler(db_prep_handler)
    prep_worker.register_result_queue(db_queue)
    pool.apply_async(prep_worker.process)

# Database stage: four workers (range(1, 5)) are created on db_queue, each with
# the ORM and the DB handler registered; no result queue is registered for them.
orm_handler = ORMHandler()
db_handler = DBHandler()
for i in range(1, 5):
    db_worker = Worker(f'DB Worker{i}', db_queue)
    db_worker.register_handler(orm_handler)
    db_worker.register_handler(db_handler)
    pool.apply_async(db_worker.process)

# No further tasks are submitted to the pool after this point.
pool.close()

# Read files name from data directory and put an item per file
for path in get_files_list(Path('data')):
    # files without an entry in file_model_map are skipped
    file_info = file_model_map.get(path.name)
    if file_info is not None:
        # NOTE(review): the local name "type" shadows the builtin here
        model, type, batch_size = file_info
        item = QueueItem(type, {'path': path}, {
            'model': model,
            'batch_size': batch_size
        })
        file_queue.put(item)

# Block until the pool's worker processes have exited.
pool.join()
self.pairs_collection[pair_id]=ddi entity_1=self.entity_collection[pair_attrs["e1"]]["text"] entity_2=self.entity_collection[pair_attrs["e2"]]["text"] if entity_1 in self.interaction_collection.keys(): if entity_2 not in self.interaction_collection[entity_1].keys(): self.interaction_collection[entity_1][entity_2]=ddi elif entity_2 in self.interaction_collection.keys(): if entity_1 not in self.interaction_collection[entity_2]: self.interaction_collection[entity_2][entity_1]=ddi else: self.interaction_collection[entity_2]={} self.interaction_collection[entity_2][entity_1]=ddi self.interaction_collection[entity_2][entity_1]=ddi document_data.close() test_medline_path= "./Test/Test for DDI Extraction task/MedLine" test_drugbank_path= "./Test/Test for DDI Extraction task/DrugBank" paths=[test_medline_path,test_drugbank_path] paths=utils.get_files_list(paths) test_corpus_instance=test_corpus() for file in paths: test_corpus_instance.parse_ddi_corpus(file) pickle.dump(test_corpus_instance.entity_collection,open("./models/test_entity_collection.p","wb")) pickle.dump(test_corpus_instance.interaction_collection,open("./models/test_interaction_collection.p","wb")) pickle.dump(test_corpus_instance.pairs_collection,open("./models/test_pairs_collection.p","wb")) pprint.pprint(test_corpus_instance.pairs_collection)