Example 1
def callback(ch, method, properties, body):
    data = json.loads(body)
    # We need:
    # - name: name of the current sheet
    # - partials: list of file names of the additional MEI files (including extension)
    # These "partial" MEIs must already be in the sheet's "whole" folder, just like the skeleton MEI
    sheet_name = data['name']
    partial_file_names = data['partials']

    # Get sheet id (for status queue)
    sheet_id = str(db[cfg.col_sheet].find_one({"name": sheet_name})["_id"])

    whole_dir = fsm.get_sheet_whole_directory(sheet_name)
    skeleton_path = whole_dir / 'aligned.mei'
    partial_paths = [whole_dir / partial for partial in partial_file_names]

    # The skeleton always has exactly one section, which contains just the measures and some additional tags
    skeleton_document = xml.parse(str(skeleton_path)).documentElement
    skeleton_section = skeleton_document.getElementsByTagName("section")[0]
    skeleton_section_xml = tt.purge_non_element_nodes(skeleton_section).toxml()
    partial_sections_xml = []
    for partial_path in partial_paths:
        if partial_path.is_file():
            partial = xml.parse(str(partial_path))
            # We have to extract the measures and put them under a "fake" section root to get a structure similar to the skeleton's
            partial = tt.replace_child_nodes(
                tt.create_element_node("section"),
                partial.getElementsByTagName("measure"))
            partial = tt.purge_non_element_nodes(partial)
            partial_sections_xml.append(partial.toxml())

    # Perform the alignments and node picking
    aligned_trees = ta.align_trees_multiple([skeleton_section_xml] +
                                            partial_sections_xml)
    final_section_tree, _ = ta.build_consensus_tree(
        aligned_trees, consensus_method=ta.consensus_bnd_enrich_skeleton)

    # The final tree only aligned the section with measures, so we need to put the contents of that section back now
    tt.replace_child_nodes(skeleton_section, final_section_tree.childNodes)

    # Write the final tree to a file
    with open(whole_dir / 'aligned.mei', 'w') as aligned_mei_file:
        # We also purge everything that is not an element, to keep the tree clean and easily output a prettified XML file
        aligned_mei_file.write(
            tt.purge_non_element_nodes(skeleton_document).toprettyxml())

    # Update status
    status_update_msg = {
        '_id': sheet_id,
        'module': 'aligner',
        'status': 'complete',
        'name': sheet_name
    }

    global channel
    channel.queue_declare(queue=cfg.mq_omr_planner_status)
    channel.basic_publish(exchange="",
                          routing_key=cfg.mq_omr_planner_status,
                          body=json.dumps(status_update_msg))
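For readers who don't have the tree_tools helpers at hand, the "fake section root" trick used above can be sketched with plain xml.dom.minidom. The snippet below is an illustrative assumption about what tt.create_element_node and tt.replace_child_nodes roughly accomplish here, not the project's actual implementation.

# Standalone sketch: wrap loose measures under a synthetic <section> root so a
# partial MEI gets the same shape as the skeleton's section (assumption: this is
# approximately what the tt helpers are used for above).
import xml.dom.minidom as xml

doc = xml.parseString('<mei><measure n="1"/><measure n="2"/></mei>')
section = doc.createElement("section")
for measure in doc.getElementsByTagName("measure"):
    section.appendChild(measure.cloneNode(deep=True))  # re-parent copies under one section
print(section.toxml())  # <section><measure n="1"/><measure n="2"/></section>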
Example 2
def callback(ch, method, properties, body):
    # Decode body and obtain pdf id
    data = json.loads(body)
    pdf_id = data['_id']

    # Initiate mongo client and sheet collection
    client = MongoClient(settings.mongo_address[0],
                         int(settings.mongo_address[1]))
    db = client.trompa_test
    sheet_collection = db[settings.sheet_collection_name]

    # Get PDF sheet entry
    pdf_sheet = sheet_collection.find_one(ObjectId(pdf_id))
    if not pdf_sheet:
        raise Exception(f"PDF Sheet under id {pdf_id} does not exist!")
    print(pdf_sheet)
    pdf_sheet_path = Path(pdf_sheet["sheet_path"])
    pdf_sheet_name = pdf_sheet_path.stem

    # PDF -> JPEG
    print("Converting PDF to JPEG page images...")
    pages = convert_from_path(pdf_sheet_path.absolute(), 300)
    img_pages_path = fsm.get_sheet_pages_directory(pdf_sheet_name)
    for index, page in enumerate(pages):
        page_path = img_pages_path / f'page_{index}.jpg'
        page.save(page_path, 'JPEG')
        sheet_collection.update_one({'sheet_path': str(pdf_sheet_path)},
                                    {'$push': {
                                        'pages_path': str(page_path)
                                    }},
                                    upsert=True)
        print(f"{index} pages out of {len(pages)}")
    print("DONE")

    # JPEG -> MEI
    print("Converting JPEG pages to MEI skeleton...")
    to_mei.run(pdf_sheet_name)

    # Update sheet on mongo
    # TODO: This doesn't seem necessary given that the MEI will always be called "aligned.mei"; the fsm can handle the paths
    mei_path = fsm.get_sheet_whole_directory(pdf_sheet_name) / "aligned.mei"
    sheet_collection.update_one({'_id': ObjectId(pdf_id)},
                                {'$push': {
                                    'mei_path': str(mei_path)
                                }},
                                upsert=True)

    # Output name to sheet queue
    status_update_msg = {
        '_id': pdf_id,
        'module': 'measure_detector',
        'status': 'complete',
        'name': pdf_sheet_name
    }
    add_to_queue('status_queue', 'status_queue', json.dumps(status_update_msg))
    print(
        f"Published PDF->MEI converted sheet {pdf_sheet_name} to message queue!"
    )
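The $push update above appends each page path to an array field on the sheet document. A minimal standalone sketch of that pattern follows; the host, database, collection, and paths are placeholders, not values from the source.

# Hypothetical illustration of the $push pattern; names and paths are placeholders.
from pymongo import MongoClient

client = MongoClient("localhost", 27017)
sheets = client["trompa_test"]["sheets"]
sheets.update_one({"sheet_path": "/data/sheets/example.pdf"},
                  {"$push": {"pages_path": "/data/sheets/example/pages/page_0.jpg"}},
                  upsert=True)
# After converting N pages, 'pages_path' holds an ordered array of N image paths.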
def callback(ch, method, properties, body):
    data = json.loads(body)
    sheet_name = data['name']
    task_id = data['task_id']

    client = MongoClient(settings.mongo_address[0],
                         int(settings.mongo_address[1]))
    db = client.trompa_test
    sheet_id = str(db[settings.sheet_collection_name].find_one(
        {"name": sheet_name})["_id"])

    # Obtain aggregated XML
    aggregated_result = db[
        settings.aggregated_result_collection_name].find_one(
            {"task_id": task_id})
    aggregated_xml = xml.parseString("<mei>" + aggregated_result["xml"] +
                                     "</mei>")
    aggregated_dict = {
        x.attributes["n"].value: x
        for x in aggregated_xml.getElementsByTagName("measure")
    }

    # Get MEI file and measures
    mei_path = fsm.get_sheet_whole_directory(sheet_name) / "aligned.mei"
    mei_xml = xml.parse(str(mei_path))
    mei_measures = mei_xml.getElementsByTagName("measure")

    # Replace measures with new info
    for measure in mei_measures:
        mei_n = measure.attributes["n"].value
        if mei_n in aggregated_dict:
            measure.childNodes = aggregated_dict[mei_n].childNodes

    # Write MEI file
    with open(str(mei_path), 'w') as mei_file:
        mei_file.write(mei_xml.toxml())

    status_update_msg = {
        '_id': sheet_id,
        'module': 'score_rebuilder',
        'status': 'complete',
        'name': sheet_name,
        'task_id': task_id
    }

    global channel
    channel.queue_declare(queue="status_queue")
    channel.basic_publish(exchange="",
                          routing_key="status_queue",
                          body=json.dumps(status_update_msg))
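For context, a callback with this signature is typically registered on a pika channel along the following lines; the queue name, host, and auto_ack choice are assumptions, since the source only shows the callback and a global channel.

# Hypothetical consumer wiring for the callback above (queue name and host are assumptions).
import pika

connection = pika.BlockingConnection(pika.ConnectionParameters(host="localhost"))
channel = connection.channel()
channel.queue_declare(queue="score_rebuilder_queue")
channel.basic_consume(queue="score_rebuilder_queue",
                      on_message_callback=callback,  # callback(ch, method, properties, body)
                      auto_ack=True)
channel.start_consuming()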
def callback(ch, method, properties, body):
    data = json.loads(body)
    sheet_name = data['name']
    task_id = data['task_id']

    client = MongoClient(cfg.mongodb_address.ip, cfg.mongodb_address.port)
    db = client[cfg.db_name]

    # Get MEI file and measures
    mei_path = fsm.get_sheet_whole_directory(sheet_name) / "aligned.mei"
    mei_xml_tree = xml.parse(str(mei_path))
    # mei_measures = mei_xml.getElementsByTagName("measure")

    # Obtain corresponding task and slice
    task = db[cfg.col_task].find_one({"_id": ObjectId(task_id)})
    # measure_staff_slice = db[cfg.col_slice].find_one({"_id" : ObjectId(task["slice_id"])})
    # slice_measures = mei_measures[measure_staff_slice["start"]: measure_staff_slice["end"]]

    # Get aggregated XML
    aggregated_result = db[cfg.col_aggregated_result].find_one({
        "task_id": task_id,
        "step": task["step"]
    })

    if aggregated_result:
        aggregated_xml = aggregated_result["result"]

        # Temporary solution: give the slice somewhat more context by inserting only the header of the previous measure into it
        tree = xml.parseString(aggregated_xml).documentElement
        index = int(tree.getElementsByTagName("measure")[0].getAttribute(
            "n")) - 1  # n-index is shifted up by 1
        if index > 0:
            measure = mei_xml_tree.getElementsByTagName("measure")[
                index - 1].cloneNode(deep=True)  # get the previous measure
            measure.childNodes = []
            tree.insertBefore(measure, tree.childNodes[0])
            aggregated_xml = tree.toxml()

        # Perform combination with original MEI via tree aligner
        mei_section = mei_xml_tree.getElementsByTagName("section")[0]
        mei_section_xml = mei_section.toxml()
        aligned_trees = ta.align_trees_multiple(
            [mei_section_xml, aggregated_xml],
            distance_function=ta.node_distance_anchored)
        final_section_tree, _ = ta.build_consensus_tree(
            aligned_trees, consensus_method=ta.consensus_bnd_override_inner)
        tt.replace_child_nodes(mei_section, final_section_tree.childNodes)

        # Write MEI file
        with open(str(mei_path), 'w') as mei_file:
            mei_file.write(
                tt.purge_non_element_nodes(
                    mei_xml_tree.documentElement).toprettyxml())

        status_update_msg = {
            '_id': task_id,
            'module': 'score_rebuilder',
            'status': 'complete'
        }
    else:
        print(
            f"Aggregated result for task with id {task_id} at step {task['step']} did not exist!"
        )
        status_update_msg = {
            '_id': task_id,
            'module': 'score_rebuilder',
            'status': 'failed'
        }

    global channel
    channel.queue_declare(queue=cfg.mq_task_scheduler_status)
    channel.basic_publish(exchange="",
                          routing_key=cfg.mq_task_scheduler_status,
                          body=json.dumps(status_update_msg))
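The "previous-measure header" trick above clones the measure that precedes the slice, strips its children so only the measure element itself remains, and prepends it to the aggregated slice. A standalone minidom illustration follows; the element content is made up, and assigning childNodes directly is the same shortcut the source uses.

# Illustration only: prepend an emptied clone of the previous measure to a slice.
import xml.dom.minidom as xml

full = xml.parseString('<section><measure n="1"><staff/></measure><measure n="2"><staff/></measure></section>')
slice_tree = xml.parseString('<section><measure n="2"><staff/></measure></section>').documentElement

prev = full.getElementsByTagName("measure")[0].cloneNode(deep=True)
prev.childNodes = []  # keep only the measure "header" (its attributes), as in the code above
slice_tree.insertBefore(prev, slice_tree.childNodes[0])
print(slice_tree.toxml())  # <section><measure n="1"/><measure n="2"><staff/></measure></section>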
def callback(ch, method, properties, body):
    # Decode body and obtain pdf id
    data = json.loads(body)
    pdf_id = data['_id']

    # Initiate mongo client and sheet collection
    client = MongoClient(
        cfg.mongodb_address.ip,
        cfg.mongodb_address.port)
    db = client[cfg.db_name]
    sheet_collection = db[cfg.col_sheet]

    # Get PDF sheet entry
    pdf_sheet = sheet_collection.find_one(ObjectId(pdf_id))
    if not pdf_sheet:
        raise Exception(f"PDF Sheet under id {pdf_id} does not exist!")
    print(pdf_sheet)
    pdf_sheet_path = Path(pdf_sheet["sheet_path"])
    pdf_sheet_name = pdf_sheet_path.stem

    # PDF -> JPEG
    print("Converting PDF to JPEG page images...")
    i = 1
    img_pages_path = fsm.get_sheet_pages_directory(pdf_sheet_name)
    while True:
        try:
            page = convert_from_path(pdf_sheet_path.absolute(), 300, first_page=i, last_page=i+1)[0]
            page_path = img_pages_path / f'page_{i}.jpg'
            page.save(page_path, 'JPEG')
            sheet_collection.update_one({'sheet_path': str(pdf_sheet_path)},
                                        {'$push': {'pages_path': str(page_path)}})
            del page
            print(f"{i} pages out of {len(pages)}")
        except:
            print("Reached end of PDF")
            break
        i += 1
    print("PDF conversion finished succesfully!")

    # JPEG -> MEI
    if fsm.skeleton_exists(pdf_sheet_name):
        print("Using pre-existing skeleton, skipping measure detection...")
    else:
        print("Converting JPEG pages to MEI skeleton via measure detector...")
        to_mei.run(pdf_sheet_name, connection)

    # Update sheet on mongo
    mei_path = fsm.get_sheet_whole_directory(pdf_sheet_name) / "aligned.mei"
    sheet_collection.update_one({'_id': ObjectId(pdf_id)},
                                {'$push': {'mei_path': str(mei_path)}})

    # Output name to sheet queue
    status_update_msg = {
        '_id': pdf_id,
        'module': 'measure_detector',
        'status': 'complete',
        'name': pdf_sheet_name}
    add_to_queue(
        cfg.mq_omr_planner_status,
        cfg.mq_omr_planner_status,
        json.dumps(status_update_msg))
    print(f"Published PDF->MEI converted sheet {pdf_sheet_name} to message queue!")
Example 6
    def __init__(self, name):
        # Create all relevant paths and names
        mei_path = fsm.get_sheet_whole_directory(name) / "aligned.mei"
        self.pages_path = fsm.get_sheet_pages_directory(name)
        self.name = name

        # Data structures
        self.measures = []
        self.lines = []
        self.pages = []
        self.images = {}

        # MEI parsing
        self.mei = xml.parse(str(mei_path))

        # Storing the zones in a dict and collect page images
        image_names = []
        zones = {}
        for surface in self.mei.getElementsByTagName("surface"):
            graphic = surface.getElementsByTagName("graphic")[0]
            image_name = graphic.attributes["target"].value
            image_names.append(image_name)
            for zone in surface.getElementsByTagName("zone"):
                zones[zone.attributes["xml:id"].value] = zone

        line = []
        page = []
        entries = [x for x in self.mei.getElementsByTagName("section")[0].childNodes if x.nodeType==xml.Node.ELEMENT_NODE]

        skipped_first_page_tag = False
        accumulated_score_def = self.mei.getElementsByTagName("scoreDef")[0]  # The initial one
        last_score_def_index = -1
        score_def_before_measure = None
        self.context = self.build_initial_context()
        for entry_index, entry in enumerate(entries):
            if entry.tagName == "pb":
                if not skipped_first_page_tag:
                    skipped_first_page_tag = True
                else:
                    self.pages.append(Page(tuple(page), page[0].measures[0].index, len(self.pages), image_names[len(self.pages)]))
                    del page[:]
            if entry.tagName == "scoreDef":
                self.update_score_def_with_score_def(accumulated_score_def, entry)
                last_score_def_index = entry_index
                score_def_before_measure = entry
            if entry.tagName == "sb" or entry.tagName == "pb" and line:
                line_obj = Line(tuple(line), line[0].index, len(self.lines))
                self.lines.append(line_obj)
                page.append(line_obj)
                del line[:]
            if entry.tagName == "measure":
                staffs = []
                for staff in entry.getElementsByTagName("staff"):
                    zone = zones[staff.attributes["facs"].value[1:]]
                    ulc = tuple([int(v) for v in (zone.attributes["ulx"].value, zone.attributes["uly"].value)])  # Upper left corner
                    lrc = tuple([int(v) for v in (zone.attributes["lrx"].value, zone.attributes["lry"].value)])  # Lower right corner

                    # TODO: this seems redundant, should be solved by making specific slices instead
                    has_clef = False
                    # If the line list is empty, this measure is the first measure, and thus the staff contains a clef
                    if not line:
                        has_clef = True

                    inner_xml = staff.toxml()

                    score_staff = Staff(ulc, lrc, lrc[0]-ulc[0], lrc[1]-ulc[1], len(staffs), len(self.measures), len(self.lines), len(self.pages), inner_xml, has_clef)
                    staffs.append(score_staff)

                # Adapt context for measure
                measure_context = self.context.cloneNode(deep=True)
                measure_context_score_def = measure_context.getElementsByTagName("scoreDef")[0]
                self.update_score_def_with_score_def(measure_context_score_def, accumulated_score_def)
                for staffDef in measure_context_score_def.getElementsByTagName("staffDef"):
                    staff_n = staffDef.getAttribute("n")
                    clef, clef_entry_index = self.backtrack_first_staff_with_clef(entry, entries, staff_n)

                    # If the clef came later than the last scoredef, we should override the clef
                    if clef and clef_entry_index > last_score_def_index:
                        self.update_score_def_with_clef(measure_context_score_def, clef, staff_n)

                score_def_before_measure_xml = None
                if score_def_before_measure:
                    score_def_before_measure_xml = score_def_before_measure.toxml()
                    score_def_before_measure = None

                score_measure = Measure(
                    staffs,
                    len(self.measures),
                    entry.toxml(),
                    measure_context.toxml(),
                    score_def_before_measure_xml)

                self.measures.append(score_measure)
                line.append(score_measure)
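The facs attribute on a staff (or measure) references a zone's xml:id with a leading '#', which is why the lookup above strips the first character of the value. A tiny standalone example, with made-up ids and coordinates:

# Resolving a facs="#..." reference to its zone (ids and coordinates are made up).
import xml.dom.minidom as xml

doc = xml.parseString(
    '<mei><surface><zone xml:id="zone_1" ulx="10" uly="20" lrx="110" lry="220"/></surface>'
    '<measure facs="#zone_1"/></mei>')
zones = {z.attributes["xml:id"].value: z for z in doc.getElementsByTagName("zone")}
measure = doc.getElementsByTagName("measure")[0]
zone = zones[measure.attributes["facs"].value[1:]]  # strip the leading '#'
print(zone.attributes["ulx"].value, zone.attributes["uly"].value)  # -> 10 20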
def callback(ch, method, properties, body):
    data = json.loads(body)
    sheet_name = data['name']
    post_processing_steps = data["steps"]
    task_id = data['task_id']

    # Get MEI file
    mei_path = fsm.get_sheet_whole_directory(sheet_name) / "aligned.mei"
    mei_xml_tree = tt.purge_non_element_nodes(xml.parse(str(mei_path)))
    mei_section = mei_xml_tree.getElementsByTagName("section")[0]

    if "clef" in post_processing_steps:
        print(f"Performing clef post-processing for sheet {sheet_name}")
        for layer in mei_xml_tree.getElementsByTagName("layer"):
            element = layer.firstChild

            if element is not None and element.tagName == "clef":
                staff = layer.parentNode
                measure = staff.parentNode

                clef_line  = element.getAttribute("line")
                clef_shape = element.getAttribute("shape")
                layer.removeChild(element)

                prev = measure.previousSibling
                scoreDef = None

                while prev:
                    if prev.tagName == "measure":
                        break
                    if prev.tagName == "scoreDef":
                        scoreDef = prev
                        break
                    prev = prev.previousSibling

                # TODO: actually generalize this code
                if not scoreDef:
                    scoreDef = tt.create_element_node("scoreDef")
                    mei_section.insertBefore(scoreDef, measure)

                staffGrp = tt.first_or_none(scoreDef, "staffGrp")
                if not staffGrp:
                    staffGrp = tt.create_element_node("staffGrp")
                    scoreDef.appendChild(staffGrp)

                staffDef = tt.first_or_none(staffGrp, "staffDef", lambda e: e.getAttribute("n") == staff.getAttribute("n"))
                if not staffDef:
                    staffDef = tt.create_element_node("staffDef", {"n": staff.getAttribute("n")})
                    staffGrp.appendChild(staffDef)

                staffDef.setAttribute("clef.line", clef_line)
                staffDef.setAttribute("clef.shape", clef_shape)

    # Write MEI file if there were changes
    if post_processing_steps:
        with open(str(mei_path), 'w') as mei_file:
            mei_file.write(tt.purge_non_element_nodes(mei_xml_tree.documentElement).toprettyxml())

    status_update_msg = {
        '_id': task_id,
        'module': 'post_processing',
        'status': 'complete'
    }

    global channel
    channel.queue_declare(queue=cfg.mq_task_scheduler_status)
    channel.basic_publish(exchange="", routing_key=cfg.mq_task_scheduler_status, body=json.dumps(status_update_msg))
Example 8
def run(sheet_name, connection):
    import datetime
    import sys
    sys.path.append("..")
    import common.file_system_manager as fsm
    from common.settings import cfg

    from glob import glob
    import json
    import os
    from uuid import uuid4

    from lxml import etree
    from PIL import Image, ImageFont
    from PIL.ImageDraw import ImageDraw
    import requests
    from tqdm import tqdm

    version = '1.0.0'

    template = f'''<?xml version="1.0" encoding="UTF-8"?>
    <mei xmlns="http://www.music-encoding.org/ns/mei">
        <meiHead>
            <fileDesc>
                <titleStmt>
                    <title/>
                </titleStmt>
                <pubStmt/>
            </fileDesc>
             <encodingDesc>
                <appInfo>
                    <application isodate="{datetime.datetime.now().replace(microsecond=0).isoformat()}" version="{version}">
                        <name>MeasureDetector</name>
                        <p>Measures detected with MeasureDetector</p>
                    </application>
                </appInfo>
            </encodingDesc>
        </meiHead>
        <music>
            <facsimile>
            </facsimile>
            <body>
            </body>
        </music>
    </mei>'''.encode()

    def draw_boxes(image_path, measures):
        image = Image.open(image_path).convert('RGBA')
        overlay = Image.new('RGBA', image.size)
        image_draw = ImageDraw(overlay)

        for measure in measures:
            image_draw.rectangle([
                int(measure['left']),
                int(measure['top']),
                int(measure['right']),
                int(measure['bottom'])
            ],
                                 fill='#00FFFF1B')
        for m, measure in enumerate(measures):
            image_draw.rectangle([
                int(measure['left']),
                int(measure['top']),
                int(measure['right']),
                int(measure['bottom'])
            ],
                                 outline='#008888',
                                 width=2)

        result_image = Image.alpha_composite(image, overlay).convert('RGB')

        target_dir = os.path.join(os.path.dirname(image_path), 'bboxes')
        os.makedirs(target_dir, exist_ok=True)

        basename = os.path.basename(image_path)
        result_path = os.path.join(target_dir, basename)
        result_image.save(result_path)

    # Detect measures
    page_path = fsm.get_sheet_pages_directory(sheet_name)
    image_paths = sorted(
        [str(p.resolve()) for p in page_path.iterdir() if p.is_file()],
        key=lambda x: int(os.path.basename(x).split('_')[1].split('.')[0]))

    pages = []

    tqdm.write(f'Detecting measures in {len(image_paths)} images...')
    for image_path in tqdm(image_paths, unit='img'):
        with open(image_path, 'rb') as image:
            address = ":".join(map(str, cfg.measure_detector_address))
            response = requests.post(f'http://{address}/upload',
                                     files={'image': image})
        measures = json.loads(response.content.decode('utf-8'))['measures']
        pages.append({'path': image_path, 'measures': measures})

    # Generate MEI file
    xml_parser = etree.XMLParser(remove_blank_text=True)
    mei = etree.fromstring(template, parser=xml_parser)

    mei_facsimile = mei.xpath('//*[local-name()="facsimile"]')[0]
    mei_body = mei.xpath('//*[local-name()="body"]')[0]

    mei_mdiv = etree.Element('mdiv')
    mei_mdiv.attrib[
        '{http://www.w3.org/XML/1998/namespace}id'] = 'mdiv_' + str(uuid4())
    mei_mdiv.attrib['n'] = str(1)
    mei_mdiv.attrib['label'] = ''
    mei_body.append(mei_mdiv)

    mei_score = etree.Element('score')
    mei_score.append(etree.Element('scoreDef'))
    mei_mdiv.append(mei_score)

    mei_section = etree.Element('section')
    mei_score.append(mei_section)

    mei_section.append(etree.Element('pb'))

    cur_ulx = 0
    cur_measure = 1

    for p, page in enumerate(pages):
        image = Image.open(page['path'])
        image_width, image_height = image.size
        image.close()

        measures = page['measures']
        print(measures)

        # TODO: restore this functionality in some other way?
        # if args.make_images:
        #     draw_boxes(page['path'], measures)

        mei_surface = etree.Element('surface')
        mei_surface.attrib[
            '{http://www.w3.org/XML/1998/namespace}id'] = 'surface_' + str(
                uuid4())
        mei_surface.attrib['n'] = str(p + 1)
        mei_surface.attrib['ulx'] = str(0)
        mei_surface.attrib['uly'] = str(0)
        mei_surface.attrib['lrx'] = str(image_width - 1)
        mei_surface.attrib['lry'] = str(image_height - 1)
        mei_facsimile.append(mei_surface)

        mei_graphic = etree.Element('graphic')
        mei_graphic.attrib[
            '{http://www.w3.org/XML/1998/namespace}id'] = 'graphic_' + str(
                uuid4())
        mei_graphic.attrib['target'] = os.path.basename(page['path'])
        mei_graphic.attrib['width'] = str(image_width)
        mei_graphic.attrib['height'] = str(image_height)
        mei_surface.append(mei_graphic)

        for m, measure in enumerate(measures):
            print(measure)
            mei_zone = etree.Element('zone')
            mei_zone_id = 'zone_' + str(uuid4())
            mei_zone.attrib[
                '{http://www.w3.org/XML/1998/namespace}id'] = mei_zone_id
            mei_zone.attrib['type'] = 'measure'
            mei_zone.attrib['ulx'] = str(int(measure['ulx']))
            mei_zone.attrib['uly'] = str(int(measure['uly']))
            mei_zone.attrib['lrx'] = str(int(measure['lrx']))
            mei_zone.attrib['lry'] = str(int(measure['lry']))
            mei_surface.append(mei_zone)

            mei_measure = etree.Element('measure')
            mei_measure.attrib[
                '{http://www.w3.org/XML/1998/namespace}id'] = 'measure_' + str(
                    uuid4())
            mei_measure.attrib['n'] = str(cur_measure)
            mei_measure.attrib['label'] = str(cur_measure)
            mei_measure.attrib['facs'] = f'#{mei_zone_id}'
            mei_section.append(mei_measure)
            cur_measure += 1

            # Insert a system break after the last measure of a line (the next measure
            # starts further left) or after the final measure on the page
            if m + 1 >= len(measures) or measures[m + 1]['ulx'] < measure['ulx']:
                mei_section.append(etree.Element('sb'))

        mei_section.append(etree.Element('pb'))

    mei_path = fsm.get_sheet_whole_directory(sheet_name)
    mei_file_dir = mei_path / "aligned.mei"
    with open(str(mei_file_dir), 'wb') as file:
        xml = etree.ElementTree(mei)
        xml.write(file,
                  encoding='utf-8',
                  pretty_print=True,
                  xml_declaration=True)

    tqdm.write('Done.')
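Two lxml idioms above are easy to miss: attribute keys written in Clark notation with the XML namespace serialize as xml:id, and the local-name() XPath sidesteps the MEI default namespace. A small standalone illustration (the content is made up):

# Illustration of the xml:id (Clark notation) and local-name() idioms used above.
from lxml import etree

root = etree.fromstring(
    b'<mei xmlns="http://www.music-encoding.org/ns/mei"><music><body/></music></mei>')
body = root.xpath('//*[local-name()="body"]')[0]  # namespace-agnostic lookup
section = etree.Element('section')
section.attrib['{http://www.w3.org/XML/1998/namespace}id'] = 'section_1'  # -> xml:id="section_1"
body.append(section)
print(etree.tostring(root, pretty_print=True).decode())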
Example 9
def callback(channel, method, properties, body):
    # Decode body and obtain pdf id
    data = json.loads(body)
    pdf_id = data['_id']

    sheet_collection = db[cfg.col_sheet]

    # Get PDF sheet entry
    pdf_sheet = sheet_collection.find_one(ObjectId(pdf_id))
    if not pdf_sheet:
        raise Exception(f"PDF Sheet under id {pdf_id} does not exist!")
    print(pdf_sheet)
    pdf_sheet_path = Path(pdf_sheet["sheet_path"])
    pdf_sheet_name = pdf_sheet_path.stem

    # PDF -> JPEG
    print("Converting PDF to JPEG page images...")
    # This awkward loop is done to prevent pdf2image from loading the entire PDF into memory
    # which for some reason costs several gigabytes for large sheets...
    i = 1
    img_pages_path = fsm.get_sheet_pages_directory(pdf_sheet_name)
    while True:
        try:
            page = convert_from_path(pdf_sheet_path.absolute(),
                                     300,
                                     first_page=i,
                                     last_page=i + 1)[0]
            page_path = img_pages_path / f'page_{i}.jpg'
            page.save(page_path, 'JPEG')
            sheet_collection.update_one(
                {'sheet_path': str(pdf_sheet_path)},
                {'$push': {
                    'pages_path': str(page_path)
                }})
            del page
            print(f"{i} pages out of {len(pages)}")
        except:
            print("Reached end of PDF")
            break
        i += 1
    print("PDF conversion finished succesfully!")

    # JPEG -> MEI
    if cfg.skip_measure_detection and fsm.skeleton_exists(pdf_sheet_name):
        print("Using pre-existing skeleton, skipping measure detection...")
    else:
        print("Converting JPEG pages to MEI skeleton via measure detector...")
        to_mei.run(pdf_sheet_name, connection)

    # Update sheet on mongo
    mei_path = fsm.get_sheet_whole_directory(pdf_sheet_name) / "aligned.mei"
    sheet_collection.update_one({'_id': ObjectId(pdf_id)},
                                {'$push': {
                                    'mei_path': str(mei_path)
                                }})

    # Output name to sheet queue
    status_update_msg = {
        '_id': pdf_id,
        'module': 'measure_detector',
        'status': 'complete',
        'name': pdf_sheet_name
    }

    channel.basic_publish(exchange='',
                          routing_key=cfg.mq_omr_planner_status,
                          body=json.dumps(status_update_msg))
    channel.basic_ack(method.delivery_tag)
    print(
        f"Published PDF->MEI converted sheet {pdf_sheet_name} to message queue!"
    )
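Unlike the earlier callbacks, this one acknowledges the message itself with channel.basic_ack, so it would normally be registered with auto_ack disabled. A hedged sketch of that wiring (the queue name and host are assumptions):

# Hypothetical consumer registration with manual acknowledgement.
import pika

connection = pika.BlockingConnection(pika.ConnectionParameters(host="localhost"))
channel = connection.channel()
channel.queue_declare(queue="omr_planner_queue")  # assumed queue name
channel.basic_consume(queue="omr_planner_queue",
                      on_message_callback=callback,
                      auto_ack=False)  # the callback calls basic_ack(method.delivery_tag)
channel.start_consuming()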
Example 10
def run(sheet_name, connection):

    version = '1.0.0'

    template = f'''<?xml version="1.0" encoding="UTF-8"?>
    <mei xmlns="http://www.music-encoding.org/ns/mei">
        <meiHead>
            <fileDesc>
                <titleStmt>
                    <title/>
                </titleStmt>
                <pubStmt/>
            </fileDesc>
             <encodingDesc>
                <appInfo>
                    <application isodate="{datetime.datetime.now().replace(microsecond=0).isoformat()}" version="{version}">
                        <name>MeasureDetector</name>
                        <p>Measures detected with MeasureDetector</p>
                    </application>
                </appInfo>
            </encodingDesc>
        </meiHead>
        <music>
            <facsimile>
            </facsimile>
            <body>
            </body>
        </music>
    </mei>'''.encode()

    # Detect measures
    page_path = fsm.get_sheet_pages_directory(sheet_name)
    image_paths = sorted(
        [str(p.resolve()) for p in page_path.iterdir() if p.is_file()],
        key=lambda x: int(os.path.basename(x).split('_')[1].split('.')[0]))

    results = []

    tqdm.write(f'Detecting measures in {len(image_paths)} images...')
    for image_path in tqdm(image_paths, unit='img'):
        page = detector.detect_measures(image_path)
        results.append({'path': image_path, 'page': page})
        connection.process_data_events()

    # Generate MEI file
    xml_parser = etree.XMLParser(remove_blank_text=True)
    mei = etree.fromstring(template, parser=xml_parser)

    mei_facsimile = mei.xpath('//*[local-name()="facsimile"]')[0]
    mei_body = mei.xpath('//*[local-name()="body"]')[0]

    mei_mdiv = etree.Element('mdiv')
    mei_mdiv.attrib[
        '{http://www.w3.org/XML/1998/namespace}id'] = 'mdiv_' + str(uuid4())
    mei_mdiv.attrib['n'] = str(1)
    mei_mdiv.attrib['label'] = ''
    mei_body.append(mei_mdiv)

    mei_score = etree.Element('score')
    mei_score_def = etree.Element('scoreDef')
    mei_score.append(mei_score_def)
    mei_mdiv.append(mei_score)

    mei_section = etree.Element('section')
    mei_score.append(mei_section)

    mei_section.append(etree.Element('pb'))

    cur_measure, cur_staff = 1, 1

    staff_counts = []
    section_lengths = []
    measures_per_page = []
    for p, result in enumerate(results):
        print("Processing page", page)
        page, path = result['page'], result['path']

        mei_surface = etree.Element('surface')
        mei_surface.attrib[
            '{http://www.w3.org/XML/1998/namespace}id'] = 'surface_' + str(
                uuid4())
        mei_surface.attrib['n'] = str(p + 1)
        mei_surface.attrib['ulx'] = str(0)
        mei_surface.attrib['uly'] = str(0)
        mei_surface.attrib['lrx'] = str(page.width - 1)
        mei_surface.attrib['lry'] = str(page.height - 1)
        mei_facsimile.append(mei_surface)

        mei_graphic = etree.Element('graphic')
        mei_graphic.attrib[
            '{http://www.w3.org/XML/1998/namespace}id'] = 'graphic_' + str(
                uuid4())
        mei_graphic.attrib['target'] = os.path.basename(path)
        mei_graphic.attrib['width'] = str(page.width)
        mei_graphic.attrib['height'] = str(page.height)
        mei_surface.append(mei_graphic)

        for s, system in enumerate(page.systems):
            for m, measure in enumerate(system.measures):

                mei_measure = etree.Element('measure')
                mei_measure.attrib[
                    '{http://www.w3.org/XML/1998/namespace}id'] = 'measure_' + str(
                        uuid4())
                mei_measure.attrib['n'] = str(cur_measure)
                mei_measure.attrib['label'] = str(cur_measure)
                mei_section.append(mei_measure)

                cur_staff = 1
                for st, staff in enumerate(measure.staffs):
                    mei_zone = etree.Element('zone')
                    mei_zone_id = 'zone_' + str(uuid4())
                    mei_zone.attrib[
                        '{http://www.w3.org/XML/1998/namespace}id'] = mei_zone_id
                    mei_zone.attrib['type'] = 'staff'
                    mei_zone.attrib['ulx'] = str(int(staff.ulx))
                    mei_zone.attrib['uly'] = str(int(staff.uly))
                    mei_zone.attrib['lrx'] = str(int(staff.lrx))
                    mei_zone.attrib['lry'] = str(int(staff.lry))
                    mei_surface.append(mei_zone)

                    mei_staff = etree.Element('staff')
                    mei_staff.attrib[
                        '{http://www.w3.org/XML/1998/namespace}id'] = 'staff_' + str(
                            uuid4())
                    mei_staff.attrib['n'] = str(cur_staff)
                    mei_staff.attrib['label'] = str(cur_staff)
                    mei_staff.attrib['facs'] = f'#{mei_zone_id}'

                    # Staffs should have at least one layer, can optionally be enumerated with "n" if we need more
                    mei_layer = etree.Element('layer')
                    mei_staff.append(mei_layer)

                    mei_measure.append(mei_staff)

                    cur_staff += 1
                staff_counts.append(cur_staff - 1)
                cur_measure += 1
            mei_section.append(etree.Element('sb'))
            section_lengths.append(cur_measure - 1 - sum(section_lengths))
        mei_section.append(etree.Element('pb'))
        measures_per_page.append(cur_measure - 1 - sum(measures_per_page))

    # Add the most likely staff configuration to the scoredef
    # NOTE: does not generalize to scores with more than one staff configuration
    mei_staff_group = etree.Element('staffGrp')
    mei_score_def.append(mei_staff_group)
    for i in range(round(np.mean(staff_counts))):
        n = i + 1
        mei_staff_def = etree.Element('staffDef')
        mei_staff_def.attrib['n'] = str(n)
        mei_staff_def.attrib['lines'] = '5'  # Render looks weird without lines
        mei_staff_group.append(mei_staff_def)

    # Print some detection statistics
    print("Detection Statistics:")
    print(f"{'  mean staff count:':<20}{np.mean(np.mean(staff_counts))}")
    print(f"{'  mean line length:':<20}{np.mean(np.mean(section_lengths))}")
    print(
        f"{'  mean measures per page:':<20}{np.mean(np.mean(measures_per_page))}"
    )
    print(f"{'  measures per page:'}")
    for i, count in enumerate(measures_per_page):
        page = i + 1
        print(f"{'  - ' + str(page):<20}{count}")

    mei_path = fsm.get_sheet_whole_directory(sheet_name)
    mei_file_dir = mei_path / "aligned.mei"
    with open(str(mei_file_dir), 'wb') as file:
        xml = etree.ElementTree(mei)
        xml.write(file, encoding='utf-8', pretty_print=True)

    tqdm.write('Done.')
Example 11
def callback(ch, method, properties, body):
    data = json.loads(body)
    sheet_name = data['name']

    # Get sheet id
    sheet_id = str(db[cfg.col_sheet].find_one({"name" : sheet_name})["_id"])

    # Github
    github = Github(cfg.github_token)
    org = github.get_organization(cfg.github_organization)

    if cfg.delete_if_exists:
        try:
            org.get_repo(sheet_name).delete()
            print("Deleted existing repo for", sheet_name)
        except GithubException as e:
            print("Repo doesn't exist, ready for creation!")
            print(str(e))
        # if "name already exists on this account" in str(e):

    # TODO: Handling this properly requires offline functionality for the git-repo, meaning we have to
    #       create it without relying on Github and then link it if possible
    repo = org.create_repo(sheet_name, description=f"Repository for {sheet_name}", auto_init=True)

    # Git
    git_dir_path = fsm.get_clean_sheet_git_directory(sheet_name)
    clone = None
    tries = 0
    while clone is None and tries < 5:
        try:
            clone = pygit2.clone_repository(repo.clone_url, str(git_dir_path))
        except pygit2.GitError:
            print(f"Could not clone repo at {repo.clone_url}, trying again in 1 second...")
            connection.process_data_events()
            tries += 1
            time.sleep(1)

    status = "complete"
    if clone is not None:

        clone.remotes.set_url("origin", repo.clone_url)

        # Add the PDF
        pdf_path = fsm.get_sheet_whole_directory(sheet_name) / (sheet_name + ".pdf")
        shutil.copy(str(pdf_path), str(fsm.get_sheet_git_directory(sheet_name)))
        commit(clone, "Initialize main branch")

        pushed = False
        push_tries = 0
        while not pushed and push_tries < 5:
            try:
                push(clone)
                pushed = True
            except pygit2.GitError:
                print(f"Could not push for score {sheet_name}, retrying in 1 second...")
                connection.process_data_events()
                push_tries += 1
                time.sleep(1)
        if pushed:
            # Add the MEI
            clone.create_branch(cfg.github_branch, clone.head.peel())
            branch = clone.lookup_branch(cfg.github_branch)
            ref = clone.lookup_reference(branch.name)
            clone.checkout(ref)

            mei_path = fsm.get_sheet_whole_directory(sheet_name) / "aligned.mei"
            shutil.copy(str(mei_path), str(fsm.get_sheet_git_directory(sheet_name)))
            commit(clone, "Initialize crowd manager branch", branch=cfg.github_branch)
            pushed_branch = False
            branch_push_tries = 0
            while not pushed_branch and branch_push_tries < 5:
                try:
                    push(clone, branch=cfg.github_branch, force=True)
                    pushed_branch = True
                except pygit2.GitError:
                    print(f"Could not push for score {sheet_name}, retrying in 1 second...")
                    connection.process_data_events()
                    branch_push_tries += 1
                    time.sleep(1)

            if pushed_branch:
                # Protect the newly created/pushed branch and the main branch on Github
                repo.get_branch("main").edit_protection(user_push_restrictions=[cfg.github_user])
                repo.get_branch(cfg.github_branch).edit_protection(user_push_restrictions=[cfg.github_user])

            if not pushed_branch:
                print("Warning, could not push crowd manager's branch for", sheet_name)
                status = "failed"

            del branch
            del ref
        else:
            print("Warning, could not push initial commit for", sheet_name)
            status = "failed"

        # Clean up (needed since pygit2 tends to leave files in .git open)
        del clone
        gc.collect()
    else:
        print("Warning, could not initialize repo for", sheet_name)
        status = "failed"

    # Update status
    status_update_msg = {
        '_id': sheet_id,
        'module': 'github_init',
        'status': status,
        'name': sheet_name
    }

    global channel
    channel.queue_declare(queue=cfg.mq_omr_planner_status)
    channel.basic_publish(exchange="", routing_key=cfg.mq_omr_planner_status, body=json.dumps(status_update_msg))
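The commit() and push() helpers used above are project-specific and not shown here. For reference, a minimal authenticated clone with pygit2 might look roughly like the following; the user, token, URL, and target path are placeholders.

# Hypothetical authenticated clone; credentials, URL and path are placeholders.
import pygit2

callbacks = pygit2.RemoteCallbacks(
    credentials=pygit2.UserPass("github_user", "github_token"))
clone = pygit2.clone_repository("https://github.com/example-org/example-sheet.git",
                                "/tmp/example-sheet",
                                callbacks=callbacks)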
Example 12
def callback(ch, method, properties, body):
    data = json.loads(body)
    sheet_name = data['name']

    # Get sheet id
    client = MongoClient(cfg.mongodb_address.ip, cfg.mongodb_address.port)
    db = client[cfg.db_name]
    sheet_id = str(db[cfg.col_sheet].find_one({"name" : sheet_name})["_id"])

    # Github
    github = Github(cfg.github_token)
    org = github.get_organization(cfg.github_organization)

    if cfg.delete_if_exists:
        try:
            org.get_repo(sheet_name).delete()
        except GithubException as e:
            print("Repo doesn't exist, ready for creation!")
            print(str(e))
        # if "name already exists on this account" in str(e):

    # TODO: Handling this properly requires offline functionality for the git-repo, meaning we have to
    #       create it without relying on Github and then link it if possible
    repo = org.create_repo(sheet_name, description=f"Repository for {sheet_name}", auto_init=True)

    # Git
    git_dir_path = fsm.get_clean_sheet_git_directory(sheet_name)
    clone = pygit2.clone_repository(repo.clone_url, str(git_dir_path))
    clone.remotes.set_url("origin", repo.clone_url)

    # Add the PDF
    pdf_path = fsm.get_sheet_whole_directory(sheet_name) / (sheet_name + ".pdf")
    shutil.copy(str(pdf_path), str(fsm.get_sheet_git_directory(sheet_name)))
    commit(clone, "Initialize main branch")
    push(clone)

    # Add the MEI
    clone.create_branch(cfg.github_branch, clone.head.peel())
    branch = clone.lookup_branch(cfg.github_branch)
    ref = clone.lookup_reference(branch.name)
    clone.checkout(ref)

    mei_path = fsm.get_sheet_whole_directory(sheet_name) / "aligned.mei"
    shutil.copy(str(mei_path), str(fsm.get_sheet_git_directory(sheet_name)))
    commit(clone, "Initialize crowd manager branch", branch=cfg.github_branch)
    push(clone, branch=cfg.github_branch)

    # Protect the newly created/pushed branch and the main branch on Github
    repo.get_branch("main").edit_protection(user_push_restrictions=[cfg.github_user])
    repo.get_branch(cfg.github_branch).edit_protection(user_push_restrictions=[cfg.github_user])

    # Clean up (needed since pygit2 tends to leave files in .git open)
    del clone
    del branch
    del ref
    gc.collect()

    # Update status
    status_update_msg = {
        '_id': sheet_id,
        'module': 'github_init',
        'status': 'complete',
        'name': sheet_name
    }

    global channel
    channel.queue_declare(queue=cfg.mq_omr_planner_status)
    channel.basic_publish(exchange="", routing_key=cfg.mq_omr_planner_status, body=json.dumps(status_update_msg))