Example no. 1
    def read_item(self, item_path):
        item_atts = self.att_dict[get_filename(item_path)]
        if self.meets_requirements(item_atts):
            labels = self.get_metadata(item_atts)

            # TODO: for the moment, check_Data_balance only considers the
            # first label in the list (the instrument)
            self.data.append(item_path)
            if len(self.attribute_list) != 0:
                self.metadata.append(labels)
Example no. 2
def parse_test(data_path, w_path, doc_w_path=None, doc_token_span_w_path=None):
    if doc_token_span_w_path and not file_exists(doc_token_span_w_path):
        print('{} not found, computing doc-level span information dictionary'.
              format(doc_token_span_w_path))
        documents_spans = get_real_token_span(data_path)
        # Keep a copy of the token spans to avoid re-computing them during training
        write_pickle(documents_spans, doc_token_span_w_path)
        print('{} created'.format(doc_token_span_w_path))
    elif doc_token_span_w_path:
        documents_spans = read_pickle(doc_token_span_w_path)
    else:
        # No cache path was supplied, so compute the spans without persisting them
        documents_spans = get_real_token_span(data_path)
    txt_files = get_files(data_path, ext='txt')
    documents_tokens = []
    documents_pos = []
    documents_ortho = []
    documents_fname = []
    for txt_path in txt_files:
        document_tokens = []
        document_pos = []
        document_ortho = []
        document_fname = []
        f_name = get_filename(txt_path)
        sentences = documents_spans[f_name]
        for sentence in sentences:
            sentence_tokens = []
            sentence_pos = []
            sentence_ortho = []
            sentence_fname = []
            for word_dictio in sentence:
                sentence_tokens.append(word_dictio['word'])
                sentence_pos.append(word_dictio['pos'])
                sentence_ortho.append(get_ortho_feature(word_dictio['word']))
                sentence_fname.append(f_name)
            document_tokens.append(sentence_tokens)
            document_pos.append(sentence_pos)
            document_ortho.append(sentence_ortho)
            document_fname.append(sentence_fname)
        documents_tokens.append(document_tokens)
        documents_pos.append(document_pos)
        documents_ortho.append(document_ortho)
        documents_fname.append(document_fname)
    write_bio_test(w_path,
                   documents_tokens,
                   documents_pos,
                   documents_ortho,
                   documents_fname,
                   sentence_level=True)
    if doc_w_path:
        write_bio_test(doc_w_path,
                       documents_tokens,
                       documents_pos,
                       documents_ortho,
                       documents_fname,
                       sentence_level=False)
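A minimal usage sketch of parse_test; the paths below are placeholders, and the helpers (get_files, write_bio_test, etc.) are assumed to come from the surrounding project:

parse_test(data_path='data/test',
           w_path='out/test.bio',
           doc_w_path='out/test_doc.bio',
           doc_token_span_w_path='cache/test_spans.pkl')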
def download_compranet(years):
  """
  Download Compranet data for a list of years, unzip the files and convert 
  the XLS to CSV

  :param years:
    The years for which to download data
  :type years:
    List

  :returns:

  :example:

  """
  
  tmp_folder = os.path.join(settings.folder_full_cache, 'tmp')
  check_create_folder(tmp_folder)

  for year in years:
    file_name = settings.fn_prefix + str(year) + settings.fn_extension
    src_url = settings.compranet_base_url + file_name

    print "Downloading %s" % file_name
    download(url=src_url, path=tmp_folder) 

    file_path = os.path.join(tmp_folder, file_name)
    with zipfile.ZipFile(file_path, 'r') as myzip:
      myzip.extractall(tmp_folder)

  pattern = os.path.join(tmp_folder, '*.xls*')

  for src_file in list_files(pattern):
    csv_path = os.path.join(settings.folder_full_cache, get_filename(src_file) + '.csv')
    wb = xlrd.open_workbook(src_file)
    sheet = wb.sheet_by_index(0)

    with open(csv_path, 'wb') as csvfile:  # unicodecsv expects a binary-mode file
      writer = unicodecsv.writer(csvfile, encoding='utf-8')
      for rownum in xrange(sheet.nrows):
        writer.writerow(sheet.row_values(rownum))

  remove_folder(tmp_folder)
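A usage sketch for download_compranet; the year strings are placeholders and assume Compranet publishes one archive per year under the configured base URL:

download_compranet(['2012', '2013'])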
def main():
    # Setup for database logging
    setup()
    manual = args["manual_selection"]
    machine_learning = args["use_model"]
    create_classification_folders(CLASSES, args["write_path"])
    kernel = np.ones((5, 5), np.uint8)

    # The reference digits are computed based on a supplied reference photo
    # We assume the reference photo contains all the digits 0-9 from left to right
    reference_digits = process_reference_digits()

    for vdir in get_video_list(args["video_path"]):
        for video in vdir.files:
            last_log = get_last_entry(manual, video)
            if last_log:
                count = last_log.id
            else:
                # High initial count just to reduce the chances of accidentally overwriting existing files
                count = 30000
            print("[*] Analyzing video", video)
            vs = FileVideoStream(os.path.join(vdir.directory, video)).start()

            # The current frame number
            f_num = 0

            # Allow the buffer some time to fill
            time.sleep(1.0)

            # Motion tracking only needs to be initialized once per video
            motion_saliency = None

            # Threshold values to control the sensitivity of contours found in the motion map
            # We assume that contours with extreme areas (very large or small) are not likely to be pollinators
            thresh1 = 100
            thresh2 = 10000

            # Is the timestamp in this video parsable yet?
            time_parsable = False
            ts_box = None

            while vs.more():
                frame = vs.read()

                # If the frame is None, the video is done being processed and we can move to the next one
                if frame is None:
                    break
                else:
                    f_num += 1

                if manual:
                    # We check if there are log entries indicating the frame has already been analyzed
                    if last_log is not None:
                        if f_num < last_log.frame:
                            print(
                                "[*] Frame number {} has already been analyzed. Waiting for frame number {}..."
                                .format(f_num, last_log.frame))
                            # Continue to the next frame if the logs indicate we have analyzed frames later than this
                            # one
                            time.sleep(0.1)
                            continue

                    for pollinator, box in manual_selection(frame, f_num):
                        if pollinator is not None:
                            w, h, _ = pollinator.shape
                            area = w * h
                            file_name = get_filename(count, video)
                            cv2.imwrite(
                                os.path.join(args['write_path'], "Pollinator",
                                             file_name), pollinator)
                            frame_time, ts_box = compute_frame_time(
                                frame, reference_digits, time_parsable, ts_box)
                            frame_time, time_parsable = check_frame_time(
                                frame, frame_time, reference_digits,
                                time_parsable, ts_box)
                            if frame_time is None:
                                print(
                                    "[!] Failed to process time, probably because the frame is distorted. "
                                    "Skipping frame...")
                                continue

                            add_log_entry(directory=vdir.directory,
                                          video=video,
                                          time=frame_time,
                                          classification="Pollinator",
                                          size=area,
                                          bbox=box,
                                          frame_number=f_num,
                                          name=file_name,
                                          manual=manual)
                            count += 1
                else:
                    # This block is executed once per video
                    if motion_saliency is None:
                        # Motion saliency tracks the important parts of the video by finding differences between the
                        # current frame and previous frames
                        motion_saliency = cv2.saliency.MotionSaliencyBinWangApr2014_create(
                        )
                        motion_saliency.setImagesize(frame.shape[1],
                                                     frame.shape[0])
                        motion_saliency.init()

                    # Even though this frame might have been analyzed already, we still want the motion saliency
                    # object to process it, so that when we do reach a frame we haven't seen before, the saliency
                    # object can see the differences in it based on past frames
                    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                    (success,
                     motion_map) = motion_saliency.computeSaliency(gray)

                    # We check if there are log entries indicating the frame has already been analyzed
                    if last_log is not None:
                        if f_num < last_log.frame:
                            print(
                                "[*] Frame number {} has already been analyzed. Waiting for frame number {}..."
                                .format(f_num, last_log.frame))
                            # Continue to the next frame if the logs indicate we have analyzed frames later than this
                            # one
                            continue

                    frame_time, ts_box = compute_frame_time(
                        frame, reference_digits, time_parsable, ts_box)
                    frame_time, time_parsable = check_frame_time(
                        frame, frame_time, reference_digits, time_parsable,
                        ts_box)

                    count = analyze_motion(motion_map, frame, f_num,
                                           frame_time, kernel, thresh1,
                                           thresh2, video, count, vdir,
                                           machine_learning)

            vs.stop()

            # Video done being processed. Move to finished folder
            os.rename(
                os.path.join(vdir.directory, video),
                os.path.join(args["video_path"], "Processed Videos", video))

    cv2.destroyAllWindows()
def analyze_motion(motion_map, frame, f_num, frame_time, kernel, thresh1,
                   thresh2, video, count, vdir, machine_learning):

    # Map that represents differences from the previous frames as white areas
    motion_map = (motion_map * 255).astype("uint8")
    # This morph reduces noise in the image while preserving more substantial white areas
    morph = cv2.morphologyEx(motion_map, cv2.MORPH_OPEN, kernel)

    # Flag if the frame doesn't contain any pollinators
    all_neg = False

    possible_pollinators = []

    # Draw contours around the white spots that meet our threshold criteria
    cnts, bounding_boxes = get_contours(morph,
                                        lower_thresh=thresh1,
                                        upper_thresh=thresh2)
    for cnt, box in zip(cnts, bounding_boxes):
        x, y, w, h = box

        # Expand the box a bit (20 px of margin on every side) to ensure the
        # pollinator is in the crop
        x -= 20
        y -= 20
        w += 40
        h += 40

        # Crop the pollinator with some buffer around the bounding box, limited by frame dimensions
        crop = frame[max(0, y):min(frame.shape[0], y + h),
                     max(0, x):min(frame.shape[1], x + w)]

        # Get the area of the contour box
        area = cv2.contourArea(cnt)

        # Save the location of the bounding box for later reference
        box = get_formatted_box(x, y, w, h)

        # Create a copy of the frame for each contour so we can show the user one contour at a time
        pollinators = frame.copy()

        # Draw a rectangle around the potential pollinator
        cv2.rectangle(pollinators, (x, y), (x + w, y + h), (0, 0, 255), 1)

        if machine_learning:
            # A dictionary to hold info about this possible pollinator
            possible_pol = {}

            # Create a copy of the possible pollinator for the model to consume
            model_image = crop.copy()
            model_image = pre_process(model_image)

            possible_pol["Image"] = model_image
            possible_pol["Area"] = area
            possible_pol["Box"] = box

            possible_pollinators.append(possible_pol)
        else:
            file_name = get_filename(count, video)

            # Show the user the frame with the potential pollinator highlighted with a rectangle
            cv2.imshow("Video Frame", pollinators)

            if all_neg is False:
                # This function displays the cropped-out potential pollinator and allows the user to classify it
                classification = classify(args['write_path'], file_name, crop)

                if classification == "No pollinators in frame.":
                    all_neg = True
                    classification = CLASSES[1]
                # Whatever the user decided is recorded in the log database
                add_log_entry(directory=vdir.directory,
                              video=video,
                              time=frame_time,
                              name=file_name,
                              classification=classification,
                              size=area,
                              bbox=box,
                              frame_number=f_num)
            else:
                classification = CLASSES[1]
                # all_neg is True, so we already know none of the remaining contours are pollinators
                add_log_entry(directory=vdir.directory,
                              video=video,
                              time=frame_time,
                              name=file_name,
                              classification=classification,
                              size=area,
                              bbox=box,
                              frame_number=f_num)
            count += 1

    if machine_learning and len(possible_pollinators) > 0:
        images = [pol["Image"] for pol in possible_pollinators]
        images = np.vstack(images)
        # Classify the cropped images
        results = model_classifier(model_path, images)

        print(results)

        for pol, result in zip(possible_pollinators, results):
            not_pollinator, pollinator = result
            proba = pollinator if pollinator > not_pollinator else not_pollinator
            proba *= 100

            # Build the label
            label = get_label(not_pollinator, pollinator)

            x, y, w, h = pol["Box"].split()

            if label == "Pollinator":
                # Draw a rectangle around the potential pollinator
                cv2.rectangle(frame, (int(x), int(y)),
                              (int(x) + int(w), int(y) + int(h)), (0, 255, 0),
                              1)

                add_log_entry(directory=vdir.directory,
                              video=video,
                              time=frame_time,
                              classification="Pollinator",
                              proba=proba,
                              size=pol["Area"],
                              bbox=pol["Box"],
                              frame_number=f_num)
                count += 1
            else:
                cv2.rectangle(frame, (int(x), int(y)),
                              (int(x) + int(w), int(y) + int(h)), (0, 0, 255),
                              1)

        cv2.imshow("Pollinators", frame)
        cv2.waitKey(0)

    return count
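Since get_contours is a project helper that isn't shown here, the following standalone sketch reproduces the same idea with plain OpenCV calls: a morphological opening removes small white specks from the motion map, and the surviving contours are filtered by the same kind of area thresholds (the OpenCV 4 return signature of findContours is assumed):

import cv2
import numpy as np

# Synthetic binary motion map: one tiny noise speck and one larger blob
motion_map = np.zeros((200, 200), dtype="uint8")
motion_map[50:52, 50:52] = 255       # noise, erased by the opening
motion_map[100:140, 100:140] = 255   # plausible pollinator-sized blob

kernel = np.ones((5, 5), np.uint8)
morph = cv2.morphologyEx(motion_map, cv2.MORPH_OPEN, kernel)

# Keep only contours whose area falls between the two thresholds
contours, _ = cv2.findContours(morph, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
kept = [c for c in contours if 100 < cv2.contourArea(c) < 10000]
print(len(kept))  # 1: the speck is gone, the blob survives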
model_cube_30.load_weights(model_paths + 'Fenge_32_32_32_0704.h5')
model_cube_30 = model_s

# In[5]:

test_pred_0 = pd.read_csv(pred_csv_path + "1final_test_result.csv")

# In[6]:

patients = [x for x in os.listdir(pred_csv_path) if 'orig' in x]

# In[7]:

test_pred_0["file"] = test_pred_0["seriesuid"].map(
    lambda file_name: get_filename(patients, file_name))
test_pred_0 = test_pred_0.dropna()

# In[8]:


# Generate 32*32*32 cubes for classification
def get_cube_from_img(img3d, center_x, center_y, center_z, block_size):
    # Clamp each start coordinate so the block stays inside the volume
    start_x = max(center_x - block_size // 2, 0)
    if start_x + block_size > img3d.shape[2]:
        start_x = img3d.shape[2] - block_size

    start_y = max(center_y - block_size // 2, 0)
    if start_y + block_size > img3d.shape[1]:
        start_y = img3d.shape[1] - block_size

    start_z = max(center_z - block_size // 2, 0)
    if start_z + block_size > img3d.shape[0]:
        start_z = img3d.shape[0] - block_size

    # The y-axis bound check and this return were cut off in the snippet;
    # they are assumed to mirror the x- and z-axis handling above
    return img3d[int(start_z):int(start_z) + block_size,
                 int(start_y):int(start_y) + block_size,
                 int(start_x):int(start_x) + block_size]
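A quick usage sketch on a dummy (z, y, x) volume, showing that the clamping keeps the 32-voxel cube inside the array even when the center sits near a border:

import numpy as np

img3d = np.zeros((100, 120, 140), dtype=np.float32)
cube = get_cube_from_img(img3d, center_x=5, center_y=60, center_z=95, block_size=32)
print(cube.shape)  # (32, 32, 32)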
Example no. 7
def main(arguments):
    pollinator_class_completer = pollinator_setup(arguments)

    for vdir in get_video_list(arguments["video_path"]):
        split = vdir.directory.split("/")[
            -2:]  # Extract site and plant info from directory path
        site = split[0]
        plant = split[1]
        for video in vdir.files:
            print(
                "[*] Analyzing video {} from site {}, plant number {}.".format(
                    video, site, plant))
            last_log = get_last_entry(True, video)

            vs = FileVideoStream(os.path.join(vdir.directory, video)).start()

            # The current frame number and pollinator count
            f_num = 0
            count = 0

            # Allow the buffer some time to fill
            time.sleep(2.0)

            # Keep a list of previous frames
            previous_frames = []

            while vs.more():
                frame = vs.read()

                # If the frame is None, the video is done being processed and we can move to the next one
                if frame is None:
                    break
                else:
                    f_num += 1
                    if last_log is not None and f_num <= last_log.frame:
                        print(
                            "[*] Frame number {} has already been analyzed. Waiting for frame number {}..."
                            .format(f_num, last_log.frame + 1))
                        # Continue to the next frame if the logs indicate we have analyzed frames later than this
                        # one
                        time.sleep(
                            0.01)  # Sleep here so we don't overtake the buffer
                        continue
                """
                Because previous frames are passed to manual selection,
                the pollinator selection may not have occurred on the
                current frame. Therefore, the frame number used for
                file names and logging will need to be calculated.
                """
                previous_frames = handle_previous_frames(
                    frame, previous_frames)
                pollinator, box, labeled_frame = manual_selection(
                    f_num, previous_frames)
                if pollinator is None and box is None and labeled_frame is None:
                    continue

                fnum_calc = calculate_frame_number(labeled_frame,
                                                   previous_frames, f_num)
                frame_fname = get_filename(fnum_calc, count, video, frame=True)
                if pollinator is not False and pollinator is not None:
                    # Save the whole frame as a pollinator
                    print("[*] Saving frame as an example of Pollinator.")
                    cv2.imwrite(
                        os.path.join(arguments["write_path"], "Frames",
                                     "Pollinator", frame_fname), labeled_frame)

                    # And save the pollinator
                    pol_fname = get_filename(fnum_calc, count, video)
                    count = handle_pollinator(arguments, pol_fname, vdir,
                                              count, fnum_calc, pollinator,
                                              box, pollinator_class_completer,
                                              video)
                elif pollinator is False and box is None:
                    # Save the whole frame as an example of no pollinator
                    print("[*] Saving frame as an example of Not_Pollinator.")
                    img_path = os.path.join(arguments["write_path"], "Frames",
                                            "Not_Pollinator", frame_fname)
                    cv2.imwrite(img_path, labeled_frame)
                    w, h, _ = frame.shape
                    size = w * h
                    print("[*] Logging this frame as Not_Pollinator.")
                    add_log_entry(
                        directory=vdir.directory,
                        video=video,
                        time=None,
                        classification="Not_Pollinator",
                        pollinator_id=None,
                        proba=None,
                        genus=None,
                        species=None,
                        behavior=None,
                        size=size,
                        bbox="Whole",  # Entire frame
                        size_class=None,
                        frame_number=fnum_calc,
                        manual=True,
                        img_path=img_path,
                    )

            vs.stop()

    cv2.destroyAllWindows()
Example no. 8
K.set_image_dim_ordering('th')
model_cube_30, model_s = get_net()
model_cube_30.load_weights(model_paths + 'Fenge_32_32_32_0704.h5')
model_cube_30 = model_s
# In[5]:

# Load the classification result file
#test_pred_0 = pd.read_csv(pred_csv_path + "1_vgg_final_result.csv")
test_pred_0 = pd.read_csv(
    '/devdata1/ding/data/TianChi/ali/all_1final/1final_test_result_v.csv')

# In[6]:

seriesuids = pd.read_csv(PATH['annotations_val'] + "seriesuids.csv")
test_pred_0["file"] = test_pred_0["seriesuid"].map(
    lambda file_name: get_filename(seriesuids['seriesuid'].values, file_name))
test_pred_0 = test_pred_0.dropna()

# In[7]:

patients = [x for x in os.listdir(pred_csv_path) if 'orig' in x]

# In[8]:

test_pred_0["file"] = test_pred_0["seriesuid"].map(
    lambda file_name: get_filename(patients, file_name))
test_pred_0 = test_pred_0.dropna()
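The mapping above relies on get_filename to match each seriesuid against the list of candidate patient files. A minimal sketch of what such a helper might look like, assuming it returns the first entry containing the case id and None otherwise (so the subsequent dropna() removes unmatched rows); this implementation is an assumption, not shown in the snippet:

def get_filename(file_list, case):
    # Return the first file name containing the given seriesuid, else None
    for f in file_list:
        if case in f:
            return f
    return None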

# In[9]:

def generate_json(df):
  """
  Generate OCDS record packages for each month

  :param df:
    Dataframe with all the contracts
  :type df:
    DataFrame
  """

  check_create_folder(settings.folder_ocds_json)
  check_create_folder(settings.folder_tmp)
  clean_folder(settings.folder_tmp)

  # Group the Compranet by date
  df['group_date'] = df[settings.grouping_date].convert_objects(convert_dates='coerce')
  grouped_df = df.set_index('group_date').groupby(pd.TimeGrouper(freq='M'))

  # Store the records for each month in a temporary CSV file
  # The JSON files will be generated from these CSV files, which
  # is much more performant than iterating over the rows in pandas
  files = []
  for month, records in grouped_df:
    if not records.empty:
      m = month.strftime("%Y%m%d")
      file_name = os.path.join(settings.folder_tmp, m + '.csv')
      files.append(file_name)
      records.to_csv(file_name, index=False)

  # Loop over each CSV file and create an OCDS record package
  for f in files:

    # Store the package meta-data
    ## ADD MONTH
    package = {
      "uri": os.path.join("http://example.com/" + get_filename(f) + '.json'),
      "publishedDate": get_filename(f),
      "records": [],
      "publisher": {
        "identifier": "100",
        "name": "Compranet"
      },
      "packages": []
    }

    # Read the file and generate the records
    with open(f, 'rb') as infile:
      data = csv.DictReader(infile)

      ocds_records = {}

      for record in data:
        record_id = record['NUMERO_PROCEDIMIENTO']

        # Add the generic tender data for this record,
        # if it's not there already
        if record_id not in ocds_records:
          ocds_records[record_id] = get_tender_data(record)

        # The contract and award data needs to be added for each row

        # OCDS expects a unique ID for every award. NUMERO_EXPEDIENTE is not unique, hence
        # a custom ID
        award_id = str(record['NUMERO_EXPEDIENTE']) + '-' + str(len(ocds_records[record_id]['awards']) + 1)
        
        ocds_records[record_id]['awards'].append(get_award_data(record, award_id))
        ocds_records[record_id]['contracts'].append(get_contract_data(record, award_id))

      for key, value in ocds_records.iteritems():
        package['records'].append(value)

    ofn = os.path.join(settings.folder_ocds_json, get_filename(f) + '.json')
    with open(ofn, 'w') as outfile:
      json.dump(package, outfile)
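A usage sketch, assuming the merged Compranet contract data has already been cached as a CSV (the file name below is a placeholder):

df = pd.read_csv(os.path.join(settings.folder_full_cache, 'compranet_full.csv'))
generate_json(df)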
Example no. 10
def get_real_token_span(directory):
    '''
    :param directory: path of raw text files
    :return: documents :: dictionary --> key: document name, values = [[[{'word': XX, 'start': X, 'end': X}]]]
    '''
    files = get_files(directory, ext='txt')
    documents = {}
    med_tagger = Med_Tagger()  # Starts a docker image in background
    file_counter = 0
    print('get_real_token_span::{} files to process'.format(len(files)))
    for file in files:
        file_counter += 1
        if file_counter % 100 == 0:
            print('.')
        with codecs.open(file, 'r', encoding='utf-8') as f:
            text = f.read()
        tokens_space_offsets = []
        text_space_sp = text.split(' ')
        off_set = 0
        for token in text_space_sp:
            token_offset = {
                'token': token,
                'start': off_set,
                'end': off_set + len(token)
            }
            tokens_space_offsets.append(token_offset)
            off_set += len(token)
            off_set += 1
        # sanity check: ensure the captured token indexes are correct across all the tokens
        for t_offset in tokens_space_offsets:
            token = t_offset['token']
            off_set_st = t_offset['start']
            off_set_end = t_offset['end']
            assert token == text[off_set_st:off_set_end]
        tokens_space_offsets_ptr = 0
        last_inside_token_end_idx = None
        sentences_in_doc = []
        for line in codecs.open(file, 'r', encoding='utf-8'):
            if line.strip():
                sentences_tokenized, sentences_pos = tokenize(line.strip(),
                                                              med_tagger,
                                                              return_pos=True)
                assert sentences_tokenized is not None and sentences_pos is not None
                for sentence_tokenized, sentence_pos in zip(
                        sentences_tokenized, sentences_pos):
                    words_in_sentence = []
                    # sentence_tokenized = web_tokenizer(sent)
                    for word, pos in zip(sentence_tokenized, sentence_pos):
                        token = tokens_space_offsets[tokens_space_offsets_ptr][
                            'token']
                        start = tokens_space_offsets[tokens_space_offsets_ptr][
                            'start']
                        end = tokens_space_offsets[tokens_space_offsets_ptr][
                            'end']
                        if word == token:
                            tokens_space_offsets_ptr += 1
                            last_inside_token_end_idx = None
                        elif word in token:
                            if not last_inside_token_end_idx:
                                end = start + len(word)
                            else:
                                start = last_inside_token_end_idx
                                end = start + len(word)
                            # remove \n from the start of the span
                            start, end = adjust_span(text, start, end)
                            end_of_token_space = tokens_space_offsets[
                                tokens_space_offsets_ptr]['end']
                            # is it the end of current word, if yes increment the pointer
                            if end == end_of_token_space:
                                tokens_space_offsets_ptr += 1
                                last_inside_token_end_idx = None
                            else:
                                last_inside_token_end_idx = end
                            # part of ugly checks (pharmaco::->test)
                            if text[end:end_of_token_space] == '\x85':
                                tokens_space_offsets_ptr += 1
                                last_inside_token_end_idx = None
                        # final sanity test
                        assert word == text[start: end], 'word={} != text={}\ntokens_space_offsets_ptr:{}' \
                                                         '\nsent_tokenized: {}\ndocument: {}'.\
                            format(word, text[start: end], tokens_space_offsets_ptr,
                                   sentence_tokenized, get_filename(file))
                        words_dictio = {
                            'word': word,
                            'start': start,
                            'end': end,
                            'pos': pos
                        }
                        words_in_sentence.append(words_dictio)

                    sentences_in_doc.append(words_in_sentence)

        f_name = get_filename(file)
        documents[f_name] = sentences_in_doc
    # clean-up
    del med_tagger
    return documents
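The offset bookkeeping at the top of get_real_token_span can be illustrated on a toy string: splitting on single spaces and advancing the offset by the token length plus one reproduces each token's exact character span (note that a token may still contain a newline, which is what adjust_span later corrects for):

text = 'El paciente recibe\nparacetamol'
off_set = 0
for token in text.split(' '):
    start, end = off_set, off_set + len(token)
    assert token == text[start:end]
    off_set = end + 1  # +1 skips the single space separator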
Example no. 11
def parse(data_path,
          w_path,
          doc_token_span_w_path=None,
          ann_file_ext='ann',
          append_i_tag=True):
    create_directory(get_parent_directory(w_path))
    if doc_token_span_w_path and not file_exists(doc_token_span_w_path):
        print('{} not found, computing doc-level span information dictionary'.
              format(doc_token_span_w_path))
        documents_spans = get_real_token_span(data_path)
        # Keep a copy of the token spans to avoid re-computing them during training
        write_pickle(documents_spans, doc_token_span_w_path)
        print('{} created'.format(doc_token_span_w_path))
    elif doc_token_span_w_path:
        documents_spans = read_pickle(doc_token_span_w_path)
    else:
        # No cache path was supplied, so compute the spans without persisting them
        documents_spans = get_real_token_span(data_path)
    txt_files = get_files(data_path, ext='txt')
    documents_tokens = []
    documents_tags = []
    documents_pos = []
    documents_ortho = []
    documents_segment = []
    documents_fname = []
    for txt_path in txt_files:
        document_tokens = []
        document_tags = []
        document_pos = []
        document_ortho = []
        document_segment = []
        document_fname = []
        att_path = join_path(
            data_path, '{}.{}'.format(get_filename(txt_path), ann_file_ext))
        entities_dict = parse_annotation_file(att_path)
        f_name = get_filename(txt_path)
        sentences = documents_spans[f_name]
        for sentence in sentences:
            sentence_tokens = []
            sentence_tags = []
            sentence_pos = []
            sentence_ortho = []
            sentence_segment = []
            sentence_fname = []
            for word_dictio in sentence:
                _, tag = is_token_an_entity(word_dictio, entities_dict)
                if append_i_tag:
                    if tag != 'O':
                        tag = 'I-{}'.format(tag)
                segment = 'O' if tag == 'O' else 'I-SEGMENT'
                sentence_tokens.append(word_dictio['word'])
                sentence_tags.append(tag)
                sentence_pos.append(word_dictio['pos'])
                sentence_ortho.append(get_ortho_feature(word_dictio['word']))
                sentence_segment.append(segment)
                sentence_fname.append(f_name)
            document_tokens.append(sentence_tokens)
            document_tags.append(sentence_tags)
            document_pos.append(sentence_pos)
            document_ortho.append(sentence_ortho)
            document_segment.append(sentence_segment)
            document_fname.append(sentence_fname)
        documents_tokens.append(document_tokens)
        documents_tags.append(document_tags)
        documents_pos.append(document_pos)
        documents_ortho.append(document_ortho)
        documents_segment.append(document_segment)
        documents_fname.append(document_fname)
    write_bio(w_path, documents_tokens, documents_tags, documents_pos,
              documents_ortho, documents_segment, documents_fname)