Ejemplo n.º 1
0
def ris_txt_lines_to_metajson_list(txt_lines, source, rec_id_prefix,
                                   only_first_record):
    """Parse RIS text lines and yield one Document per complete record.

    Despite the ``_list`` suffix this is a generator: a document is yielded
    each time the end-of-record tag (``RIS_KEY_END``) is read.

    Each line is split positionally: the tag is the first two characters
    and the value starts at column 6.  A value ending with "/" is treated
    as continued on the following line, except for the date tags
    (PY, Y1, DA) whose values legitimately end with a slash.

    Args:
        txt_lines: iterable of RIS text lines.
        source: if truthy, stored as ``rec_source`` of every document
            (a later DB or DP tag overwrites it).
        rec_id_prefix: currently unused; kept for interface compatibility.
        only_first_record: if truthy, stop after the first complete record.

    Yields:
        The Document built for each record, after it has been logged and
        pretty-printed.
    """
    # Tags whose value is a slash-delimited date (YYYY/MM/DD/other info).
    date_keys = ("PY", "Y1", "DA")

    document = None
    ris_type = None
    rec_type = None
    is_part_of_rec_type = None
    previous_key = None
    previous_value = None
    for line in txt_lines:
        if not line:
            continue
        line = line.rstrip('\r\n')

        # multi line management
        if previous_key:
            # Continuation: glue the raw line onto the pending value.
            key = previous_key
            value = previous_value + line
            previous_key = None
            previous_value = None
        else:
            key = line[:2].strip()
            value = line[6:].strip()
        if value.endswith("/") and key not in date_keys:
            # Trailing slash marks a value continued on the next line.
            # Date tags are excluded: their trailing slash is part of
            # the date layout, not a continuation marker.
            previous_key = key
            previous_value = value.rstrip('/')
            continue

        if not key:
            # empty line -> continue
            continue
        elif key == RIS_KEY_BEGIN:
            # record begin with document type -> create document
            document = Document()
            is_part_of_rec_type = None

            if source:
                document["rec_source"] = source
            ris_type = value
            # NOTE(review): an unmapped RIS type presumably raises here
            # (KeyError if the mapping is a plain dict) -- confirm.
            rec_type = ris_document_type_to_metajson_document_type[
                ris_type]
            document["rec_type"] = rec_type
            if ris_type in ris_document_type_to_metajson_document_is_part_of_type:
                is_part_of_rec_type = ris_document_type_to_metajson_document_is_part_of_type[
                    ris_type]
                is_part_of = Document()
                is_part_of["rec_type"] = is_part_of_rec_type
                document["is_part_ofs"] = [is_part_of]
        elif document is None:
            # Field or end tag outside of a record (no begin tag seen
            # yet, or previous record already closed): nothing to fill.
            logging.warning(
                "RIS line outside of a record ignored: {} with value: {}".
                format(key, value))
            continue
        elif key == RIS_KEY_END:
            # record end -> return the result
            # If the container only has an abbreviated title, promote the
            # first abbreviated title to its main title.
            if "is_part_ofs" in document:
                container = document["is_part_ofs"][0]
                if ("title" not in container
                        and "title_abbreviateds" in container):
                    container["title"] = container["title_abbreviateds"][0][
                        "title"]
                    del container["title_abbreviateds"]

            logging.info("# RIS type: {}".format(ris_type))
            metajson_service.pretty_print_document(document)
            yield document
            if only_first_record:
                return
            # Do not let stray lines mutate the record already yielded.
            document = None
        else:
            # process key value
            if key == "ID":
                document["rec_id"] = value
            elif key in [
                    "T1", "TI", "CT"
            ] or (key == "BT"
                  and ris_type in [RIS_TYPE_BOOK, RIS_TYPE_UNPB]):
                # Title Primary -> title
                document["title"] = value
            elif key in [
                    "JF", "JO"
            ] or (key == "BT"
                  and ris_type not in [RIS_TYPE_BOOK, RIS_TYPE_UNPB]):
                # Title Secondary -> is_part_of["title"]
                document.add_is_part_of_title(value)
            elif key in ["JA", "J1", "J2", "T2"]:
                # Title Secondary -> is_part_of["title_abbreviateds"][0]["title"]
                document.add_is_part_of_title_abbreviated(value)
            elif key == "T3":
                # Title Series
                document.add_series_title(value)
            elif key in ["A1", "AU"]:
                document.add_creator(
                    creator_service.formatted_name_to_creator(
                        value, None, "aut"))
            elif key in ["A2", "ED"]:
                # Editors go to the container when the record has one.
                if is_part_of_rec_type:
                    document.add_is_part_of_creator(
                        creator_service.formatted_name_to_creator(
                            value, None, "edt"))
                else:
                    document.add_creator(
                        creator_service.formatted_name_to_creator(
                            value, None, "edt"))
            elif key == "A3":
                document.add_series_creator(
                    creator_service.formatted_name_to_creator(
                        value, None, "aut"))
            elif key == "A4":
                document.add_creator(
                    creator_service.formatted_name_to_creator(
                        value, None, "ctb"))
            elif key in date_keys:
                # YYYY/MM/DD/other info (like season)
                # todo: parse the components; for now only the leading
                # and trailing slashes are removed.
                document["date_issued"] = value.strip("/")
            elif key == "SP":
                document["part_page_begin"] = value
            elif key == "EP":
                document["part_page_end"] = value
            elif key == "VL":
                document["part_volume"] = value
            elif key in ["IS", "CP"]:
                document["part_issue"] = value
            elif key in ["AB", "N2"]:
                document["descriptions"] = [{
                    "language": "und",
                    "value": value
                }]
            elif key == "N1":
                document["notes"] = [{"language": "und", "value": value}]
            elif key == "PB":
                document.add_item_to_key(value, "publishers")
            elif key == "CY":
                document.add_item_to_key(value, "publication_places")
            elif key == "RP":
                document["publication_status"] = value
            elif key == "ET":
                document["edition"] = value
            elif key == "UR":
                resource = Resource()
                resource["url"] = value
                document.add_item_to_key(resource, "resources")
            elif key == "AN":
                # Accession Number
                identifier = metajson_service.create_identifier(
                    "accessionnumber", value)
                document.add_identifier(identifier)
            elif key == "CN":
                # Call Number
                identifier = metajson_service.create_identifier(
                    "callnumber", value)
                document.add_identifier(identifier)
            elif key == "DO":
                # DOI
                identifier = metajson_service.create_identifier(
                    "doi", value)
                document.add_identifier(identifier)
            elif key == "SN":
                # ISSN for serial-like record types, ISBN otherwise.
                if rec_type in [
                        constants.DOC_TYPE_JOURNALARTICLE,
                        constants.DOC_TYPE_MAGAZINEARTICLE,
                        constants.DOC_TYPE_NEWSPAPERARTICLE,
                        constants.DOC_TYPE_JOURNAL
                ]:
                    id_type = "issn"
                else:
                    id_type = "isbn"
                identifier = metajson_service.create_identifier(
                    id_type, value)
                # The number identifies the container when there is one.
                if is_part_of_rec_type is None:
                    document.add_identifier(identifier)
                else:
                    document["is_part_ofs"][0].add_identifier(identifier)
            elif key == "CA":
                document["caption"] = value
            elif key == "DB":
                # Name of Database -> rec_source ?
                document["rec_source"] = value
            elif key == "DP":
                # Database Provider -> rec_source ?
                document["rec_source"] = value
            elif key == "KW":
                if "keywords" not in document:
                    document["keywords"] = {"und": []}
                document["keywords"]["und"].append(value)
            else:
                logging.debug("Not managed key: {} with value: {}".format(
                    key, value))
Ejemplo n.º 2
0
def ris_txt_lines_to_metajson_list(txt_lines, source, rec_id_prefix, only_first_record):
    """Parse RIS text lines and yield one Document per complete record.

    Despite the ``_list`` suffix this is a generator: a document is yielded
    each time the end-of-record tag (``RIS_KEY_END``) is read.

    Each line is split positionally: the tag is the first two characters
    and the value starts at column 6.  A value ending with "/" is treated
    as continued on the following line, except for the date tags
    (PY, Y1, DA) whose values legitimately end with a slash.

    Args:
        txt_lines: iterable of RIS text lines.
        source: if truthy, stored as ``rec_source`` of every document
            (a later DB or DP tag overwrites it).
        rec_id_prefix: currently unused; kept for interface compatibility.
        only_first_record: if truthy, stop after the first complete record.

    Yields:
        The Document built for each record, after it has been logged and
        pretty-printed.
    """
    # Tags whose value is a slash-delimited date (YYYY/MM/DD/other info).
    date_keys = ("PY", "Y1", "DA")

    document = None
    ris_type = None
    rec_type = None
    is_part_of_rec_type = None
    previous_key = None
    previous_value = None
    for line in txt_lines:
        if not line:
            continue
        line = line.rstrip('\r\n')

        # multi line management
        if previous_key:
            # Continuation: glue the raw line onto the pending value.
            key = previous_key
            value = previous_value + line
            previous_key = None
            previous_value = None
        else:
            key = line[:2].strip()
            value = line[6:].strip()
        if value.endswith("/") and key not in date_keys:
            # Trailing slash marks a value continued on the next line.
            # Date tags are excluded: their trailing slash is part of
            # the date layout, not a continuation marker.
            previous_key = key
            previous_value = value.rstrip('/')
            continue

        if not key:
            # empty line -> continue
            continue
        elif key == RIS_KEY_BEGIN:
            # record begin with document type -> create document
            document = Document()
            is_part_of_rec_type = None

            if source:
                document["rec_source"] = source
            ris_type = value
            # NOTE(review): an unmapped RIS type presumably raises here
            # (KeyError if the mapping is a plain dict) -- confirm.
            rec_type = ris_document_type_to_metajson_document_type[ris_type]
            document["rec_type"] = rec_type
            if ris_type in ris_document_type_to_metajson_document_is_part_of_type:
                is_part_of_rec_type = ris_document_type_to_metajson_document_is_part_of_type[ris_type]
                is_part_of = Document()
                is_part_of["rec_type"] = is_part_of_rec_type
                document["is_part_ofs"] = [is_part_of]
        elif document is None:
            # Field or end tag outside of a record (no begin tag seen
            # yet, or previous record already closed): nothing to fill.
            logging.warning("RIS line outside of a record ignored: {} with value: {}".format(key, value))
            continue
        elif key == RIS_KEY_END:
            # record end -> return the result
            # If the container only has an abbreviated title, promote the
            # first abbreviated title to its main title.
            if "is_part_ofs" in document:
                container = document["is_part_ofs"][0]
                if "title" not in container and "title_abbreviateds" in container:
                    container["title"] = container["title_abbreviateds"][0]["title"]
                    del container["title_abbreviateds"]

            logging.info("# RIS type: {}".format(ris_type))
            metajson_service.pretty_print_document(document)
            yield document
            if only_first_record:
                return
            # Do not let stray lines mutate the record already yielded.
            document = None
        else:
            # process key value
            if key == "ID":
                document["rec_id"] = value
            elif key in ["T1", "TI", "CT"] or (key == "BT" and ris_type in [RIS_TYPE_BOOK, RIS_TYPE_UNPB]):
                # Title Primary -> title
                document["title"] = value
            elif key in ["JF", "JO"] or (key == "BT" and ris_type not in [RIS_TYPE_BOOK, RIS_TYPE_UNPB]):
                # Title Secondary -> is_part_of["title"]
                document.add_is_part_of_title(value)
            elif key in ["JA", "J1", "J2", "T2"]:
                # Title Secondary -> is_part_of["title_abbreviateds"][0]["title"]
                document.add_is_part_of_title_abbreviated(value)
            elif key == "T3":
                # Title Series
                document.add_series_title(value)
            elif key in ["A1", "AU"]:
                document.add_creator(creator_service.formatted_name_to_creator(value, None, "aut"))
            elif key in ["A2", "ED"]:
                # Editors go to the container when the record has one.
                if is_part_of_rec_type:
                    document.add_is_part_of_creator(creator_service.formatted_name_to_creator(value, None, "edt"))
                else:
                    document.add_creator(creator_service.formatted_name_to_creator(value, None, "edt"))
            elif key == "A3":
                document.add_series_creator(creator_service.formatted_name_to_creator(value, None, "aut"))
            elif key == "A4":
                document.add_creator(creator_service.formatted_name_to_creator(value, None, "ctb"))
            elif key in date_keys:
                # YYYY/MM/DD/other info (like season)
                # todo: parse the components; for now only the leading
                # and trailing slashes are removed.
                document["date_issued"] = value.strip("/")
            elif key == "SP":
                document["part_page_begin"] = value
            elif key == "EP":
                document["part_page_end"] = value
            elif key == "VL":
                document["part_volume"] = value
            elif key in ["IS", "CP"]:
                document["part_issue"] = value
            elif key in ["AB", "N2"]:
                document["descriptions"] = [{"language": "und", "value": value}]
            elif key == "N1":
                document["notes"] = [{"language": "und", "value": value}]
            elif key == "PB":
                document.add_item_to_key(value, "publishers")
            elif key == "CY":
                document.add_item_to_key(value, "publication_places")
            elif key == "RP":
                document["publication_status"] = value
            elif key == "ET":
                document["edition"] = value
            elif key == "UR":
                resource = Resource()
                resource["url"] = value
                document.add_item_to_key(resource, "resources")
            elif key == "AN":
                # Accession Number
                identifier = metajson_service.create_identifier("accessionnumber", value)
                document.add_identifier(identifier)
            elif key == "CN":
                # Call Number
                identifier = metajson_service.create_identifier("callnumber", value)
                document.add_identifier(identifier)
            elif key == "DO":
                # DOI
                identifier = metajson_service.create_identifier("doi", value)
                document.add_identifier(identifier)
            elif key == "SN":
                # ISSN for serial-like record types, ISBN otherwise.
                if rec_type in [constants.DOC_TYPE_JOURNALARTICLE, constants.DOC_TYPE_MAGAZINEARTICLE, constants.DOC_TYPE_NEWSPAPERARTICLE, constants.DOC_TYPE_JOURNAL]:
                    id_type = "issn"
                else:
                    id_type = "isbn"
                identifier = metajson_service.create_identifier(id_type, value)
                # The number identifies the container when there is one.
                if is_part_of_rec_type is None:
                    document.add_identifier(identifier)
                else:
                    document["is_part_ofs"][0].add_identifier(identifier)
            elif key == "CA":
                document["caption"] = value
            elif key == "DB":
                # Name of Database -> rec_source ?
                document["rec_source"] = value
            elif key == "DP":
                # Database Provider -> rec_source ?
                document["rec_source"] = value
            elif key == "KW":
                if "keywords" not in document:
                    document["keywords"] = {"und": []}
                document["keywords"]["und"].append(value)
            else:
                logging.debug("Not managed key: {} with value: {}".format(key, value))