def read_item(self, item_path):
    """Register ``item_path`` when its attributes pass the requirement filter.

    The attributes are looked up in ``self.att_dict`` under the key returned
    by ``get_filename(item_path)``. Accepted paths are appended to
    ``self.data``; their metadata is appended to ``self.metadata`` only when
    ``self.attribute_list`` is non-empty. Rejected items are ignored.
    """
    attributes = self.att_dict[get_filename(item_path)]
    if not self.meets_requirements(attributes):
        return

    item_labels = self.get_metadata(attributes)
    # TO-DO: for the moment check_Data_balance only considers the
    # first label of the list (the instrument)
    self.data.append(item_path)
    if len(self.attribute_list) > 0:
        self.metadata.append(item_labels)
def parse_test(data_path, w_path, doc_w_path=None, doc_token_span_w_path=None):
    """Write the test-set feature files for the ``.txt`` documents in ``data_path``.

    :param data_path: directory containing the raw ``.txt`` documents
    :param w_path: output path of the sentence-level file
    :param doc_w_path: optional output path of the document-level file
    :param doc_token_span_w_path: optional pickle path used to cache the
        token-span dictionary; when omitted the spans are recomputed and
        nothing is cached
    """
    if doc_token_span_w_path and file_exists(doc_token_span_w_path):
        documents_spans = read_pickle(doc_token_span_w_path)
    else:
        # Either no cache path was given or the cache does not exist yet.
        # The previous code tried to read_pickle(None) in the first case.
        if doc_token_span_w_path:
            print('{} not found, computing doc-level-span information dictionary'.
                  format(doc_token_span_w_path))
        documents_spans = get_real_token_span(data_path)
        if doc_token_span_w_path:
            # keep a copy of token spans to avoid re-computing it during training etc.,
            write_pickle(documents_spans, doc_token_span_w_path)
            print('{} created'.format(doc_token_span_w_path))

    documents_tokens = []
    documents_pos = []
    documents_ortho = []
    documents_fname = []
    for txt_path in get_files(data_path, ext='txt'):
        f_name = get_filename(txt_path)
        sentences = documents_spans[f_name]
        # Every feature keeps the same documents -> sentences -> words nesting.
        documents_tokens.append(
            [[word['word'] for word in sentence] for sentence in sentences])
        documents_pos.append(
            [[word['pos'] for word in sentence] for sentence in sentences])
        documents_ortho.append(
            [[get_ortho_feature(word['word']) for word in sentence]
             for sentence in sentences])
        documents_fname.append(
            [[f_name for _ in sentence] for sentence in sentences])

    write_bio_test(w_path, documents_tokens, documents_pos, documents_ortho,
                   documents_fname, sentence_level=True)
    if doc_w_path:
        write_bio_test(doc_w_path, documents_tokens, documents_pos,
                       documents_ortho, documents_fname, sentence_level=False)
def download_compranet(years):
    """
    Download Compranet data for a list of years, unzip the files and
    convert the XLS to CSV

    :param years:
      The years for which to download data. Each year is concatenated into
      the archive file name, so it has to be a string.
    :type years:
      List

    :returns:
      None. One CSV per extracted workbook (first sheet only) is written to
      settings.folder_full_cache.
    """
    tmp_folder = os.path.join(settings.folder_full_cache, 'tmp')
    check_create_folder(tmp_folder)

    try:
        for year in years:
            file_name = settings.fn_prefix + year + settings.fn_extension
            src_url = settings.compranet_base_url + file_name

            print("Downloading %s" % file_name)
            download(url=src_url, path=tmp_folder)

            file_path = os.path.join(tmp_folder, file_name)
            with zipfile.ZipFile(file_path, 'r') as myzip:
                myzip.extractall(tmp_folder)

        pattern = os.path.join(tmp_folder, '*.xls*')
        for src_file in list_files(pattern):
            csv_path = os.path.join(settings.folder_full_cache,
                                    get_filename(src_file) + '.csv')
            wb = xlrd.open_workbook(src_file)
            sheet = wb.sheet_by_index(0)

            # unicodecsv emits encoded bytes, so the target must be opened in
            # binary mode (text mode fails on Python 3 and mangles line
            # endings on Windows).
            with open(csv_path, 'wb') as csvfile:
                writer = unicodecsv.writer(csvfile, encoding='utf-8')
                for rownum in range(sheet.nrows):
                    writer.writerow(sheet.row_values(rownum))
    finally:
        # Always drop the scratch folder, also when a download or a
        # conversion fails half-way.
        remove_folder(tmp_folder)
def main():
    """Extract pollinator crops from every video under ``args["video_path"]``.

    Reads the module-level ``args`` mapping (``manual_selection``,
    ``use_model``, ``write_path``, ``video_path``). Every frame of every video
    is handled either by ``manual_selection`` (manual mode) or by motion
    saliency followed by ``analyze_motion``. Frames with a number below
    ``last_log.frame`` of the last log entry are skipped. After a video has
    been read completely it is moved to ``<video_path>/Processed Videos``.
    """
    # Setup for database logging
    setup()
    manual = args["manual_selection"]
    machine_learning = args["use_model"]
    create_classification_folders(CLASSES, args["write_path"])
    # 5x5 structuring element, handed to analyze_motion for its morphology step
    kernel = np.ones((5, 5), np.uint8)
    # The reference digits are computed based on a supplied reference photo
    # We assume the reference photo contains all the digits 0-9 from left to right
    reference_digits = process_reference_digits()
    for vdir in get_video_list(args["video_path"]):
        for video in vdir.files:
            last_log = get_last_entry(manual, video)
            if last_log:
                # Resume numbering from the id of the last log entry
                count = last_log.id
            else:
                # High initial count just to reduce the chances of accidentally overwriting existing files
                count = 30000
            print("[*] Analyzing video", video)
            vs = FileVideoStream(os.path.join(vdir.directory, video)).start()
            # The current frame number
            f_num = 0
            # Allow the buffer some time to fill
            time.sleep(1.0)
            # Motion tracking needs only be initialized once per video
            motion_saliency = None
            # Threshold values to control the sensitivity of contours found in the motion map
            # We assume that contours with extreme areas (very large or small) are not likely to be pollinators
            thresh1 = 100
            thresh2 = 10000
            # Is the timestamp in this video parsable yet?
            time_parsable = False
            ts_box = None
            while vs.more():
                frame = vs.read()
                # If the frame is None, the video is done being processed and we can move to the next one
                if frame is None:
                    break
                else:
                    f_num += 1
                    if manual:
                        # We check if there are log entries indicating the frame has already been analyzed
                        if last_log is not None:
                            if f_num < last_log.frame:
                                print(
                                    "[*] Frame number {} has already been analyzed. Waiting for frame number {}..."
                                    .format(f_num, last_log.frame))
                                # Continue to the next frame if the logs indicate we have analyzed frames later
                                # than this one
                                time.sleep(0.1)
                                continue
                        for pollinator, box in manual_selection(frame, f_num):
                            if pollinator is not None:
                                # Only the product is used, so the order of the first two
                                # shape entries does not matter here
                                w, h, _ = pollinator.shape
                                area = w * h
                                file_name = get_filename(count, video)
                                # NOTE(review): the crop is written before the timestamp is validated;
                                # when frame_time is None below, the image stays on disk without a log
                                # entry and count is not advanced - confirm this is intended
                                cv2.imwrite(
                                    os.path.join(args['write_path'], "Pollinator", file_name),
                                    pollinator)
                                frame_time, ts_box = compute_frame_time(
                                    frame, reference_digits, time_parsable, ts_box)
                                frame_time, time_parsable = check_frame_time(
                                    frame, frame_time, reference_digits, time_parsable, ts_box)
                                if frame_time is None:
                                    print(
                                        "[!] Failed to process time, probably because the frame is distorted. "
                                        "Skipping frame...")
                                    continue
                                add_log_entry(directory=vdir.directory,
                                              video=video,
                                              time=frame_time,
                                              classification="Pollinator",
                                              size=area,
                                              bbox=box,
                                              frame_number=f_num,
                                              name=file_name,
                                              manual=manual)
                                count += 1
                    else:
                        # This block is executed once per video
                        if motion_saliency is None:
                            # Motion saliency tracks the important parts of the video by finding differences
                            # between the current frame and previous frames
                            motion_saliency = cv2.saliency.MotionSaliencyBinWangApr2014_create()
                            motion_saliency.setImagesize(frame.shape[1], frame.shape[0])
                            motion_saliency.init()
                        # Even though this frame might be processed already, we still want to have the motion
                        # saliency object analyze it so that when we do get a frame we haven't seen before, the
                        # motion saliency object can see the differences in it based on past frames
                        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                        (success, motion_map) = motion_saliency.computeSaliency(gray)
                        # We check if there are log entries indicating the frame has already been analyzed
                        if last_log is not None:
                            if f_num < last_log.frame:
                                print(
                                    "[*] Frame number {} has already been analyzed. Waiting for frame number {}..."
                                    .format(f_num, last_log.frame))
                                # Continue to the next frame if the logs indicate we have analyzed frames later
                                # than this one
                                continue
                        frame_time, ts_box = compute_frame_time(
                            frame, reference_digits, time_parsable, ts_box)
                        frame_time, time_parsable = check_frame_time(
                            frame, frame_time, reference_digits, time_parsable, ts_box)
                        # NOTE(review): unlike the manual branch, frame_time is not checked for None here
                        count = analyze_motion(motion_map, frame, f_num, frame_time,
                                               kernel, thresh1, thresh2, video,
                                               count, vdir, machine_learning)
            vs.stop()
            # Video done being processed. Move to finished folder
            os.rename(
                os.path.join(vdir.directory, video),
                os.path.join(args["video_path"], "Processed Videos", video))
    cv2.destroyAllWindows()
def analyze_motion(motion_map, frame, f_num, frame_time, kernel, thresh1,
                   thresh2, video, count, vdir, machine_learning):
    """Classify the moving regions of one frame and log the results.

    Contours are taken from the morphologically opened motion map. Without a
    model, every contour is shown to the user, classified through ``classify``
    and logged. With a model, all crops are classified in one batch and only
    the ones labelled "Pollinator" are logged. Uses the module-level ``args``,
    ``CLASSES`` and ``model_path``.

    :param motion_map: saliency map of the frame; scaled by 255 and cast to uint8
    :param frame: the video frame the crops are taken from
    :param f_num: frame number stored with each log entry
    :param frame_time: timestamp stored with each log entry
    :param kernel: structuring element for the opening operation
    :param thresh1: lower threshold passed to ``get_contours``
    :param thresh2: upper threshold passed to ``get_contours``
    :param video: video file name
    :param count: running counter used for file names
    :param vdir: video directory object; ``vdir.directory`` is logged
    :param machine_learning: True to classify with the model instead of the user
    :returns: ``count``, incremented once per contour in manual mode and once
        per crop labelled "Pollinator" in model mode
    """
    # Map that represents differences from the previous frames as white areas
    motion_map = (motion_map * 255).astype("uint8")
    # This morph reduces noise in the image while preserving more substantial white areas
    morph = cv2.morphologyEx(motion_map, cv2.MORPH_OPEN, kernel)
    # Set once the user states that the frame doesn't contain any pollinators
    all_neg = False
    possible_pollinators = []
    # Find contours around the white spots that meet our threshold criteria
    cnts, bounding_boxes = get_contours(morph,
                                        lower_thresh=thresh1,
                                        upper_thresh=thresh2)
    for cnt, box in zip(cnts, bounding_boxes):
        x, y, w, h = box
        # Expand the box a bit to ensure the pollinator is in the crop
        # NOTE(review): x/y move by 20 and w/h grow by 20, so the box only
        # grows towards the top-left; the right/bottom edges stay where they
        # were. Left untouched because logged boxes would change - confirm.
        x -= 20
        y -= 20
        w += 20
        h += 20
        # Crop the pollinator with some buffer around the bounding box, limited by frame dimensions
        crop = frame[max(0, y):min(frame.shape[0], y + h),
                     max(0, x):min(frame.shape[1], x + w)]
        # Area enclosed by the contour (not by the expanded box)
        area = cv2.contourArea(cnt)
        # Save the location of the bounding box for later reference
        box = get_formatted_box(x, y, w, h)

        if machine_learning:
            # Collect the pre-processed crop so all candidates can be
            # classified in a single batch after the loop
            possible_pollinators.append({
                "Image": pre_process(crop.copy()),
                "Area": area,
                "Box": box,
            })
            continue

        # The annotated copy is only displayed in manual mode, so it is only
        # built here; one copy per contour lets us show one contour at a time
        pollinators = frame.copy()
        cv2.rectangle(pollinators, (x, y), (x + w, y + h), (0, 0, 255), 1)

        file_name = get_filename(count, video)
        # Show the user the frame with the potential pollinator highlighted with a rectangle
        cv2.imshow("Video Frame", pollinators)
        if all_neg:
            # All negative is true, so we know every contour is not a pollinator
            classification = CLASSES[1]
        else:
            # Display the cropped out potential pollinator and let the user classify it
            classification = classify(args['write_path'], file_name, crop)
            if classification == "No pollinators in frame.":
                all_neg = True
                classification = CLASSES[1]
        # Whatever was decided is recorded in the log database
        add_log_entry(directory=vdir.directory,
                      video=video,
                      time=frame_time,
                      name=file_name,
                      classification=classification,
                      size=area,
                      bbox=box,
                      frame_number=f_num)
        count += 1

    if machine_learning and len(possible_pollinators) > 0:
        images = np.vstack([pol["Image"] for pol in possible_pollinators])
        # Classify the cropped images
        results = model_classifier(model_path, images)
        print(results)
        for pol, result in zip(possible_pollinators, results):
            not_pollinator, pollinator = result
            # Probability of the winning class, as a percentage
            proba = pollinator if pollinator > not_pollinator else not_pollinator
            proba *= 100
            # Build the label
            label = get_label(not_pollinator, pollinator)
            x, y, w, h = (int(value) for value in pol["Box"].split())
            is_pollinator = label == "Pollinator"
            # Green rectangle for pollinators, red for everything else
            colour = (0, 255, 0) if is_pollinator else (0, 0, 255)
            cv2.rectangle(frame, (x, y), (x + w, y + h), colour, 1)
            if is_pollinator:
                add_log_entry(directory=vdir.directory,
                              video=video,
                              time=frame_time,
                              classification="Pollinator",
                              proba=proba,
                              size=pol["Area"],
                              bbox=pol["Box"],
                              frame_number=f_num)
                count += 1
        cv2.imshow("Pollinators", frame)
        cv2.waitKey(0)
    return count
# Load the trained weights, then rebind the name to the second model.
# NOTE(review): presumably model_s shares its layers with the model the
# weights were loaded into - confirm against get_net().
model_cube_30.load_weights(model_paths + 'Fenge_32_32_32_0704.h5')
model_cube_30 = model_s

# In[5]:

# Load the prediction result file
#test_pred_0 = pd.read_csv(pred_csv_path + "1final_test_result.csv")
test_pred_0 = pd.read_csv(pred_csv_path + "1final_test_result.csv")

# In[6]:

# Directory entries whose name contains 'orig'
patients = [x for x in os.listdir(pred_csv_path) if 'orig' in x]

# In[7]:

# Attach the matching file to every prediction; rows with any missing value
# (presumably those without a matching file) are dropped afterwards
test_pred_0["file"] = test_pred_0["seriesuid"].map(
    lambda file_name: get_filename(patients, file_name))
test_pred_0 = test_pred_0.dropna()

# In[8]:


# Generate the cubes used for classification, size 32*32*32
# NOTE(review): this definition looks truncated in this view - start_y has no
# upper clamp and nothing is returned. Also, `block_size / 2` is a float under
# Python 3 true division; confirm the interpreter version before slicing.
def get_cube_from_img(img3d, center_x, center_y, center_z, block_size):
    # x is clamped against axis 2 of img3d
    start_x = max(center_x - block_size / 2, 0)
    if start_x + block_size > img3d.shape[2]:
        start_x = img3d.shape[2] - block_size
    start_y = max(center_y - block_size / 2, 0)
    # z is clamped against axis 0 of img3d
    start_z = max(center_z - block_size / 2, 0)
    if start_z + block_size > img3d.shape[0]:
        start_z = img3d.shape[0] - block_size
def main(arguments):
    """Manually label pollinators in every video below ``arguments["video_path"]``.

    Frames up to and including ``last_log.frame`` of the last log entry are
    skipped. For the remaining frames the user selection decides whether the
    labelled frame is stored (and logged) as "Pollinator" or "Not_Pollinator".
    """
    pollinator_class_completer = pollinator_setup(arguments)

    for vdir in get_video_list(arguments["video_path"]):
        # Extract site and plant info from directory path
        site_and_plant = vdir.directory.split("/")[-2:]
        site = site_and_plant[0]
        plant = site_and_plant[1]

        for video in vdir.files:
            print(
                "[*] Analyzing video {} from site {}, plant number {}.".format(
                    video, site, plant))
            last_log = get_last_entry(True, video)
            stream = FileVideoStream(os.path.join(vdir.directory,
                                                  video)).start()

            # The current frame number and pollinator count
            f_num, count = 0, 0
            # Allow the buffer some time to fill
            time.sleep(2.0)
            # Keep a list of previous frames
            previous_frames = []

            while stream.more():
                frame = stream.read()
                # A None frame means the video is exhausted; go to the next one
                if frame is None:
                    break
                f_num += 1

                already_logged = (last_log is not None
                                  and f_num <= last_log.frame)
                if already_logged:
                    print(
                        "[*] Frame number {} has already been analyzed. Waiting for frame number {}..."
                        .format(f_num, last_log.frame + 1))
                    # Sleep here so we don't overtake the buffer, then move on
                    time.sleep(0.01)
                    continue

                # Because previous frames are passed to manual selection, the
                # pollinator selection may not have occurred on the current
                # frame. Therefore, the frame number used for file names and
                # logging will need to be calculated.
                previous_frames = handle_previous_frames(
                    frame, previous_frames)
                pollinator, box, labeled_frame = manual_selection(
                    f_num, previous_frames)
                nothing_selected = (pollinator is None and box is None
                                    and labeled_frame is None)
                if nothing_selected:
                    continue

                fnum_calc = calculate_frame_number(labeled_frame,
                                                   previous_frames, f_num)
                frame_fname = get_filename(fnum_calc,
                                           count,
                                           video,
                                           frame=True)

                if not (pollinator is False or pollinator is None):
                    # Save the whole frame as a pollinator
                    print("[*] Saving frame as an example of Pollinator.")
                    pollinator_frame_path = os.path.join(
                        arguments["write_path"], "Frames", "Pollinator",
                        frame_fname)
                    cv2.imwrite(pollinator_frame_path, labeled_frame)
                    # And save the pollinator
                    pol_fname = get_filename(fnum_calc, count, video)
                    count = handle_pollinator(arguments, pol_fname, vdir,
                                              count, fnum_calc, pollinator,
                                              box, pollinator_class_completer,
                                              video)
                elif pollinator is False and box is None:
                    # Save the whole frame as an example of no pollinator
                    print("[*] Saving frame as an example of Not_Pollinator.")
                    img_path = os.path.join(arguments["write_path"], "Frames",
                                            "Not_Pollinator", frame_fname)
                    cv2.imwrite(img_path, labeled_frame)
                    dim_a, dim_b, _ = frame.shape
                    size = dim_a * dim_b
                    print("[*] Logging this frame as Not_Pollinator.")
                    add_log_entry(
                        directory=vdir.directory,
                        video=video,
                        time=None,
                        classification="Not_Pollinator",
                        pollinator_id=None,
                        proba=None,
                        genus=None,
                        species=None,
                        behavior=None,
                        size=size,
                        bbox="Whole",  # Entire frame
                        size_class=None,
                        frame_number=fnum_calc,
                        manual=True,
                        img_path=img_path,
                    )

            stream.stop()

    cv2.destroyAllWindows()
# NOTE(review): K is presumably keras.backend; 'th' would select the
# Theano-style (channels-first) image ordering - confirm.
K.set_image_dim_ordering('th')
model_cube_30, model_s = get_net()
# Load the trained weights, then rebind the name to the second model returned
# by get_net()
model_cube_30.load_weights(model_paths + 'Fenge_32_32_32_0704.h5')
model_cube_30 = model_s

# In[5]:

# Load the classification result file
#test_pred_0 = pd.read_csv(pred_csv_path + "1_vgg_final_result.csv")
test_pred_0 = pd.read_csv(
    '/devdata1/ding/data/TianChi/ali/all_1final/1final_test_result_v.csv')

# In[6]:

# First pass: map every prediction against the validation series ids and drop
# the rows that contain missing values
seriesuids = pd.read_csv(PATH['annotations_val'] + "seriesuids.csv")
test_pred_0["file"] = test_pred_0["seriesuid"].map(
    lambda file_name: get_filename(seriesuids['seriesuid'].values, file_name))
test_pred_0 = test_pred_0.dropna()

# In[7]:

# Directory entries whose name contains 'orig'
patients = [x for x in os.listdir(pred_csv_path) if 'orig' in x]

# In[8]:

# Second pass: overwrite the "file" column using the 'orig' directory entries
# and drop the rows that contain missing values again
test_pred_0["file"] = test_pred_0["seriesuid"].map(
    lambda file_name: get_filename(patients, file_name))
test_pred_0 = test_pred_0.dropna()

# In[9]:
def generate_json(df):
    """
    Generate OCDS record packages for each month

    :param df:
      Dataframe with all the contracts. A 'group_date' column is added to it.
    :type df:
      DataFrame

    :returns:
      None. One JSON package per non-empty month is written to
      settings.folder_ocds_json.
    """
    check_create_folder(settings.folder_ocds_json)
    check_create_folder(settings.folder_tmp)
    clean_folder(settings.folder_tmp)

    # Group the Compranet by date
    df['group_date'] = df[settings.grouping_date].convert_objects(
        convert_dates='coerce')
    monthly = df.set_index('group_date').groupby(pd.TimeGrouper(freq='M'))

    # Store the records for each month in a temporary CSV file
    # The JSON files will be generated from these CSV files, which
    # is much more performant than iterating over the rows in pandas
    month_files = []
    for month, month_records in monthly:
        if month_records.empty:
            continue
        csv_name = os.path.join(settings.folder_tmp,
                                month.strftime("%Y%m%d") + '.csv')
        month_files.append(csv_name)
        month_records.to_csv(csv_name, index=False)

    # Loop over each CSV file and create an OCDS record package
    for csv_name in month_files:
        stem = get_filename(csv_name)

        # Store the package meta-data
        ## ADD MONTH
        package = {
            "uri": "http://example.com/" + stem + '.json',
            "publishedDate": stem,
            "records": [],
            "publisher": {
                "identifier": "100",
                "name": "Compranet"
            },
            "packages": []
        }

        # Read the file and generate the records
        ocds_records = {}
        with open(csv_name, 'rb') as infile:
            for row in csv.DictReader(infile):
                record_id = row['NUMERO_PROCEDIMIENTO']

                # Add the generic tender data for this record,
                # if it's not there already
                if record_id not in ocds_records:
                    ocds_records[record_id] = get_tender_data(row)
                entry = ocds_records[record_id]

                # The contract and award data needs to be added for each row
                # OCDS expects a unique ID for every award. NUMERO_EXPEDIENTE
                # is not unique, hence a custom ID
                award_id = (str(row['NUMERO_EXPEDIENTE']) + '-' +
                            str(len(entry['awards']) + 1))
                entry['awards'].append(get_award_data(row, award_id))
                entry['contracts'].append(get_contract_data(row, award_id))

        package['records'].extend(ocds_records.values())

        ofn = os.path.join(settings.folder_ocds_json, stem + '.json')
        with open(ofn, 'w') as outfile:
            json.dump(package, outfile)
def get_real_token_span(directory):
    '''
    Compute the character span of every tagger token in every ``.txt`` file.

    Each document is first split on single spaces to obtain reference
    offsets; the words returned by ``tokenize`` are then aligned against
    those space-delimited tokens, line by line.

    :param directory: path of raw text files
    :return: documents :: dictionary --> key: document name,
             values = [[[{'word': XX, 'start': X, 'end': X, 'pos': XX}]]]
             (sentences of the document -> words of the sentence)
    :raises AssertionError: when a computed span does not reproduce its word
    '''
    files = get_files(directory, ext='txt')
    documents = {}
    med_tagger = Med_Tagger()  # Starts a docker image in background
    file_counter = 0
    print('get_real_token_span::{} files to process'.format(len(files)))
    for file_path in files:
        file_counter += 1
        if file_counter % 100 == 0:
            print('.')
        with codecs.open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Offsets of the space-delimited tokens; +1 accounts for the space
        tokens_space_offsets = []
        off_set = 0
        for token in text.split(' '):
            tokens_space_offsets.append({
                'token': token,
                'start': off_set,
                'end': off_set + len(token)
            })
            off_set += len(token) + 1

        # sanity check if captured token indexes are correct across all the tokens
        for t_offset in tokens_space_offsets:
            assert t_offset['token'] == text[t_offset['start']:t_offset['end']]

        tokens_space_offsets_ptr = 0
        last_inside_token_end_idx = None
        sentences_in_doc = []
        # The file is deliberately re-read through codecs so the line
        # splitting stays exactly as before; the handle is now closed by the
        # `with` block instead of being leaked once per document.
        with codecs.open(file_path, 'r', encoding='utf-8') as line_reader:
            for line in line_reader:
                stripped_line = line.strip()
                if not stripped_line:
                    continue
                sentences_tokenized, sentences_pos = tokenize(
                    stripped_line, med_tagger, return_pos=True)
                assert sentences_tokenized is not None and sentences_pos is not None
                for sentence_tokenized, sentence_pos in zip(
                        sentences_tokenized, sentences_pos):
                    words_in_sentence = []
                    for word, pos in zip(sentence_tokenized, sentence_pos):
                        current = tokens_space_offsets[tokens_space_offsets_ptr]
                        token = current['token']
                        start = current['start']
                        end = current['end']
                        if word == token:
                            tokens_space_offsets_ptr += 1
                            last_inside_token_end_idx = None
                        elif word in token:
                            if not last_inside_token_end_idx:
                                # remove \n from the start of text
                                # NOTE(review): the return value is discarded
                                # here, exactly as in the original code -
                                # confirm whether it was meant to be assigned.
                                adjust_span(text, start, end, update_end=False)
                                end = start + len(word)
                            else:
                                start = last_inside_token_end_idx
                                end = start + len(word)
                                start, end = adjust_span(text, start, end)
                            start, end = adjust_span(text, start, end)
                            end_of_token_space = current['end']
                            # is it the end of current word, if yes increment the pointer
                            if end == end_of_token_space:
                                tokens_space_offsets_ptr += 1
                                last_inside_token_end_idx = None
                            else:
                                last_inside_token_end_idx = end
                                # part of ugly checks (pharmaco::->test)
                                if text[end:end_of_token_space] == '\x85':
                                    tokens_space_offsets_ptr += 1
                                    last_inside_token_end_idx = None
                        # final sanity test
                        assert word == text[start: end], 'word={} != text={}\ntokens_space_offsets_ptr:{}' \
                                                         '\nsent_tokenized: {}\ndocument: {}'.\
                            format(word, text[start: end], tokens_space_offsets_ptr,
                                   sentence_tokenized, get_filename(file_path))
                        words_in_sentence.append({
                            'word': word,
                            'start': start,
                            'end': end,
                            'pos': pos
                        })
                    sentences_in_doc.append(words_in_sentence)
        documents[get_filename(file_path)] = sentences_in_doc
    # clean-up
    del med_tagger
    return documents
def parse(data_path,
          w_path,
          doc_token_span_w_path=None,
          ann_file_ext='ann',
          append_i_tag=True):
    """Write the tagged feature file for the annotated documents in ``data_path``.

    :param data_path: directory holding the ``.txt`` files and their
        annotation files (same base name, extension ``ann_file_ext``)
    :param w_path: output path; its parent directory is created first
    :param doc_token_span_w_path: optional pickle path used to cache the
        token-span dictionary; when omitted the spans are recomputed and
        nothing is cached
    :param ann_file_ext: extension of the annotation files
    :param append_i_tag: prefix every tag other than 'O' with ``I-``
    """
    create_directory(get_parent_directory(w_path))

    if doc_token_span_w_path and file_exists(doc_token_span_w_path):
        documents_spans = read_pickle(doc_token_span_w_path)
    else:
        # Either no cache path was given or the cache does not exist yet.
        # The previous code handed None to file_exists / write_pickle when
        # the default was used.
        if doc_token_span_w_path:
            print('{} not found, computing doc-level-span information dictionary'.
                  format(doc_token_span_w_path))
        documents_spans = get_real_token_span(data_path)
        if doc_token_span_w_path:
            # keep a copy of token spans to avoid re-computing it during training etc.,
            write_pickle(documents_spans, doc_token_span_w_path)
            print('{} created'.format(doc_token_span_w_path))

    documents_tokens = []
    documents_tags = []
    documents_pos = []
    documents_ortho = []
    documents_segment = []
    documents_fname = []
    for txt_path in get_files(data_path, ext='txt'):
        f_name = get_filename(txt_path)
        att_path = join_path(data_path, '{}.{}'.format(f_name, ann_file_ext))
        entities_dict = parse_annotation_file(att_path)

        document_tokens = []
        document_tags = []
        document_pos = []
        document_ortho = []
        document_segment = []
        document_fname = []
        for sentence in documents_spans[f_name]:
            sentence_tokens = []
            sentence_tags = []
            sentence_pos = []
            sentence_ortho = []
            sentence_segment = []
            sentence_fname = []
            for word_dictio in sentence:
                _, tag = is_token_an_entity(word_dictio, entities_dict)
                if append_i_tag and tag != 'O':
                    tag = 'I-{}'.format(tag)
                # Segment tags only mark whether the word belongs to any entity
                segment = 'O' if tag == 'O' else 'I-SEGMENT'
                sentence_tokens.append(word_dictio['word'])
                sentence_tags.append(tag)
                sentence_pos.append(word_dictio['pos'])
                sentence_ortho.append(get_ortho_feature(word_dictio['word']))
                sentence_segment.append(segment)
                sentence_fname.append(f_name)
            document_tokens.append(sentence_tokens)
            document_tags.append(sentence_tags)
            document_pos.append(sentence_pos)
            document_ortho.append(sentence_ortho)
            document_segment.append(sentence_segment)
            document_fname.append(sentence_fname)
        documents_tokens.append(document_tokens)
        documents_tags.append(document_tags)
        documents_pos.append(document_pos)
        documents_ortho.append(document_ortho)
        documents_segment.append(document_segment)
        documents_fname.append(document_fname)

    write_bio(w_path, documents_tokens, documents_tags, documents_pos,
              documents_ortho, documents_segment, documents_fname)