Exemple #1
0
def read_json_input(input_data_path):
    """Reads json data and converts it into a DataSetProto.

  The input is expected to have this format:

  [{"URL": "http://example.com/path.html", "Contents": "Text content of webpage here"},
   {"URL": "http://example.com/path.html", "Contents": "Text content of webpage here"}]

  Each item becomes one page view sequence holding a single page view, whose
  only content entry lists the distinct code points found in the item's
  "Contents" string. The "URL" field is not read and no font name is set on
  the content entry.

  Args:
    input_data_path: Path of the JSON file to read, or '-' to read the JSON
      text from standard input.

  Returns:
    A page_view_sequence_pb2.DataSetProto with one sequence per input item.

  Raises:
    OSError: If the input file cannot be opened.
    json.JSONDecodeError: If the input is not valid JSON.
    KeyError: If an item has no "Contents" key.
  """
    if input_data_path == '-':
        data = sys.stdin.read()
    else:
        # JSON interchange text is UTF-8; decoding explicitly keeps non-ASCII
        # page contents from depending on the platform's locale encoding.
        with open(input_data_path, 'r', encoding='utf-8') as input_json_file:
            data = input_json_file.read()
    corpus = json.loads(data)

    result = page_view_sequence_pb2.DataSetProto()
    for item in corpus:
        codepoints = {ord(char) for char in item["Contents"]}

        page_content_proto = page_view_sequence_pb2.PageContentProto()
        # Sorted so the output does not depend on set iteration order.
        page_content_proto.codepoints.extend(sorted(codepoints))

        page_view_proto = page_view_sequence_pb2.PageViewProto()
        page_view_proto.contents.append(page_content_proto)

        page_view_sequence = page_view_sequence_pb2.PageViewSequenceProto()
        page_view_sequence.page_views.append(page_view_proto)
        result.sequences.append(page_view_sequence)
    return result
def sequence(views):
    """Builds a list of page view protos from per-view font mappings.

  Args:
    views: Iterable of mappings. Each mapping goes from a font name to an
      iterable of code points and describes one page view.

  Returns:
    A list with one page_view_sequence_pb2.PageViewProto per entry in views,
    in the same order. Each proto holds one content entry per font in the
    corresponding mapping.
  """

    def _build_page_view(codepoints_by_font):
        # One PageContentProto per (font name, code points) pair.
        proto = page_view_sequence_pb2.PageViewProto()
        for name, points in codepoints_by_font.items():
            entry = page_view_sequence_pb2.PageContentProto()
            entry.font_name = name
            entry.codepoints.extend(points)
            proto.contents.append(entry)
        return proto

    return [_build_page_view(mapping) for mapping in views]
def create_page_view(file_path, font_name="Roboto-Regular.ttf"):
    """Collects all of the codepoints in file_path and converts
  into a page view proto.

  Args:
    file_path: Path of a text file; it is decoded as UTF-8.
    font_name: Font name recorded on the page content entry. Defaults to
      "Roboto-Regular.ttf".

  Returns:
    A page_view_sequence_pb2.PageViewProto with a single content entry that
    carries font_name and the distinct code points of the file in ascending
    order.

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the file is not valid UTF-8.
  """
    with open(file_path, encoding='utf-8') as file:
        codepoints = {ord(char) for char in file.read()}

    content = page_view_sequence_pb2.PageContentProto()
    content.font_name = font_name
    content.codepoints.extend(sorted(codepoints))

    page_view = page_view_sequence_pb2.PageViewProto()
    page_view.contents.append(content)

    return page_view