Beispiel #1
0
def main():
    """
    Main program

    Parses the command line (``accountName password gedcomFile`` plus
    options), checks that the Gedcom file exists and carries the Ancestry.com
    markers, configures file and console logging, creates the output
    directory tree, starts the background Gedcom and cache loaders, launches
    a headless Firefox session and calls ``login`` on it, and finally walks
    the Gedcom data line by line handing FILE and _APID records to
    ``user_media`` / ``ancestry_media``.

    Exits with status 1 when the positional arguments are missing, the Gedcom
    file is not found, or the file does not look like an Ancestry.com export.
    """
    for signal_type in [SIGTERM, SIGABRT]:
        signal(signal_type, clean_exit)

    parser = OptionParser("usage: %prog [options] accountName password gedcomFile")
    parser.add_option("-c", "--count", dest="count", default="999999",
                      help=SUPPRESS_HELP, metavar="NUMBER")
    parser.add_option("-i", "--ignore",
                      action="store_true", dest="ignore", default=False,
                      help="Ignore previously identified unavailable APID entries")
    parser.add_option("-l", "--logfile", dest="logfile", default="ancestry_extract.log",
                      help="Optional log file location", metavar="FILE")
    parser.add_option("-o", "--output", dest="output", default=".",
                      help="Output directory", metavar="DIR")
    parser.add_option("-r", "--resume",
                      action="store_true", dest="resume", default=False,
                      help="Resume if prior state found")
    parser.add_option("-s", "--screenshot",
                      action="store_true", dest="screenshot", default=False,
                      help="Generate source record screenshots")
    parser.add_option("-u", "--url", dest="ancestry", default="https://www.ancestry.com",
                      help="Override default https://www.ancestry.com URL")
    (options, args) = parser.parse_args()

    if len(args) != 3:
        print('Account name, password, and gedcom file are required arguments')
        sys.exit(1)
    if not os.path.isfile(args[2]):
        print('Gedcom file not found')
        sys.exit(1)
    with open(args[2], "r") as gedcom:
        gedcom_data = gedcom.read()
    if '1 SOUR Ancestry.com Family Trees' not in gedcom_data \
       or '2 CORP Ancestry.com' not in gedcom_data:
        print('Gedcom file does not appear to be from Ancestry.com')
        sys.exit(1)

    # Carry the positional arguments on the options object, which is later
    # attached to the browser session.
    options.username = args[0]
    options.password = args[1]
    options.gedcom = args[2]

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(name)-8s %(levelname)-8s %(message)s',
                        filename=options.logfile,
                        filemode='a')

    # Mirror log records to the console with a shorter format.
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

    for check_dir in ['/media/dbid',
                      '/media/apid',
                      '/metadata/guid',
                      '/metadata/apid',
                      '/metadata/dbid']:
        os.makedirs(options.output + check_dir, exist_ok=True)

    # Both loaders run in separate processes and report back through a queue
    # while the browser is starting up.
    gedcom_queue = Queue()
    gedcom_process = Process(target=load_gedcom, args=(gedcom_queue, gedcom_data))
    gedcom_process.start()

    cache_queue = Queue()
    cache_process = Process(target=load_tables, args=(cache_queue, options.output))
    cache_process.start()

    logging.info('Launching browser')
    firefox_profile = FirefoxProfile()
    firefox_profile.set_preference("browser.startup.homepage", "about:blank")
    firefox_profile.set_preference("browser.download.folderList", 2)
    firefox_profile.set_preference("browser.download.panel.shown", False)
    firefox_profile.set_preference("browser.download.manager.showWhenStarting", False)
    firefox_profile.set_preference("browser.download.dir", options.output)
    firefox_profile.set_preference("browser.helperApps.neverAsk.saveToDisk",
                                   "application/octet-stream")
    firefox_profile.set_preference("places.history.enabled", False)
    firefox_options = Options()
    firefox_options.headless = True
    session = Firefox(options=firefox_options, firefox_profile=firefox_profile)
    atexit.register(session_cleanup, session)
    session.implicitly_wait(15)
    session.fullscreen_window()
    session.options = options
    login(session)

    # Read the queue before joining so the child is not left blocked on an
    # unread result.
    result = cache_queue.get()
    session.checkpoint = result['checkpoint']
    session.tree_id = result['tree_id']
    session.tree_name = result['tree_name']
    session.unavailable = []
    if options.resume or options.ignore:
        session.unavailable = result['unavailable']
    session.hash_map = result['hash_map']
    session.images = result['image_cache']
    cache_process.join()

    result = gedcom_queue.get()
    people = result['people']
    people_total = len(people)
    family_total = result['families']
    apid_total = result['apid_total']
    apid_unique = result['apid_unique']
    guid_total = result['guid_total']
    guid_unique = result['guid_unique']
    gedcom_process.join()

    logging.info('Found %d people and %d families to process',
                 people_total,
                 family_total)
    logging.info('Found %d unique and %d total ancestry media items to process',
                 apid_unique,
                 apid_total)
    logging.info('Found %d unique and %d total user media items to process',
                 guid_unique,
                 guid_total)

    print_flag = False
    session.line_number = 0
    success = unavailable = duplicate = skip = timeouts = total = count = 0
    person_number = family_number = 0
    apid_number = guid_number = 0
    person = husband = wife = ''
    # The limit does not change while processing, so convert it once.
    record_limit = int(options.count)
    logging.info('Starting second pass processing Gedcom media items')
    for line in gedcom_data.split('\n'):
        session.line_number = session.line_number + 1
        if options.resume and session.line_number < session.checkpoint:
            continue
        # Once the checkpoint is reached stop fast-forwarding.
        options.resume = False
        if len(line) < 5:
            continue

        fields = line.split(' ')
        if len(fields) < 2:
            # No tag on this line; nothing to dispatch on.
            continue
        tag = fields[1]
        if tag == 'SOUR':
            if session.line_number > session.checkpoint:
                session.checkpoint = session.line_number
            continue
        if '@P' in tag:
            person_number = person_number + 1
            husband = wife = ''
            person = people[tag]
            print_flag = False
            continue
        if '@F' in tag:
            family_number = family_number + 1
            husband = wife = person = ''
            print_flag = False
            continue
        if tag == 'HUSB':
            husband = people[line[7:]]
            continue
        if tag == 'WIFE':
            wife = people[line[7:]]
            continue

        if tag in ['FILE', '_APID']:
            total = total + 1
            # Start every record with no outcome so a record that is not
            # actually fetched is not tallied with the previous record's result.
            result = None
            if not print_flag:
                if session.line_number > session.checkpoint:
                    if person:
                        logging.info('Processing records for person %s (%d of %d)',
                                     person,
                                     person_number,
                                     people_total)
                    else:
                        who = join = ''
                        if husband != '':
                            who = husband
                            join = ' and '
                        if wife != '':
                            who = who + join + wife
                        logging.info('Processing records for family of %s (%d of %d)',
                                     who,
                                     family_number,
                                     family_total)
                    print_flag = True

            if ' FILE ' in line and 'f=image&guid=' in line:
                guid_number = guid_number + 1
                logging.debug('User media item %d of %d with %d unique',
                              guid_number,
                              guid_total,
                              guid_unique)
                result = user_media(session, line)
            if ' _APID ' in line:
                process_apid = True
                if options.ignore:
                    apid = line.split(' ').pop(2).strip()
                    if apid in session.unavailable:
                        process_apid = False
                        result = 'unavailable'
                if process_apid:
                    apid_number = apid_number + 1
                    if '::0' not in line:
                        logging.debug('Ancestry media item %d of %d with %d unique',
                                      apid_number,
                                      apid_total,
                                      apid_unique)
                        result = ancestry_media(session, line)

            if result == 'success':
                count = count + 1
                success = success + 1
            elif result == 'duplicate':
                duplicate = duplicate + 1
            elif result == 'unavailable':
                if person:
                    logging.info('Unavailable item for %s', person)
                else:
                    logging.info('Unavailable item for %s / %s', husband, wife)
                unavailable = unavailable + 1
            elif result == 'timeout':
                timeouts = timeouts + 1
            elif result == 'skip':
                skip = skip + 1

            if count == record_limit:
                logging.info('Reached limit of %d records processed', count)
                break

    logging.info('Total overall records:            %d', total)
    logging.info('Total processed records:          %d', success)
    logging.info('Total duplicate records:          %d', duplicate)
    logging.info('Total unavailable records:        %d', unavailable)
    logging.info('Total skipped due to unavailable: %d', skip)
    logging.info('Total skipped due to timeouts:    %d', timeouts)
def main():
    """
    Main program

    Parses the command line, optionally overlays settings from a TOML
    configuration file, checks that the Gedcom file exists and carries the
    Ancestry.com markers, creates the output directory tree (including a
    ``logs`` directory holding a timestamped log file), starts the background
    Gedcom and cache loaders, launches a headless Firefox session and calls
    ``login`` on it, and finally walks the Gedcom data line by line handing
    FILE and _APID records to ``get_user_media`` / ``get_citation_media``.

    When account, password or Gedcom file are not all given on the command
    line and no config file is named, ``ancestry_extract.toml`` is tried.
    Every key found in the config file is set on the parsed arguments,
    replacing any value given on the command line.

    Exits with status 1 when account, password or Gedcom file are still
    missing, the Gedcom file is not found, or the file does not look like an
    Ancestry.com export.
    """
    for signal_type in [SIGTERM, SIGABRT]:
        signal(signal_type, clean_exit)

    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--account", help="Account name")
    parser.add_argument("-c", "--config", help="Configuration file")
    parser.add_argument(
        "-C",
        "--citations",
        default=True,
        action="store_true",
        help="Save source images for citations",
    )
    parser.add_argument(
        "-g",
        "--gedcom",
        help="Gedcom file",
    )
    parser.add_argument(
        "-i",
        "--ignore",
        default=False,
        action="store_true",
        help="Ignore previously unavailable APID entries",
    )
    parser.add_argument(
        "-M",
        "--media",
        default=True,
        action="store_true",
        help="Save user media images",
    )
    parser.add_argument(
        "-N",
        "--newspapers",
        default=False,
        action="store_true",
        help="Save clipped newspaper images",
    )
    # Default to the current directory so the path building below never
    # receives None when neither the command line nor the config sets it.
    parser.add_argument("-o",
                        "--output",
                        default=".",
                        help="Root of output directory structure")
    parser.add_argument("-p", "--password", help="Password")
    parser.add_argument(
        "-r",
        "--resume",
        default=False,
        action="store_true",
        help="Resume if prior state found",
    )
    parser.add_argument(
        "-S",
        "--screenshots",
        default=False,
        action="store_true",
        help="Save source citation screenshots",
    )
    parser.add_argument(
        "-u",
        "--url",
        dest="ancestry",
        default="https://www.ancestry.com",
        help="Override default https://www.ancestry.com",
    )
    args = parser.parse_args()

    required_missing = not args.account or not args.password or not args.gedcom
    if required_missing and not args.config:
        # Fall back to the default config file for the missing settings.
        args.config = "ancestry_extract.toml"

    if args.config and os.path.isfile(args.config):
        with open(args.config, "r") as config_file:
            config_data = toml.load(config_file)
        for key, value in config_data.items():
            setattr(args, key, value)

    if not args.account or not args.password or not args.gedcom:
        print("Account name, password, and gedcom file are required arguments")
        sys.exit(1)

    if not os.path.isfile(args.gedcom):
        print("Gedcom file not found")
        sys.exit(1)
    with open(args.gedcom, "r") as gedcom:
        gedcom_data = gedcom.read()
    if ("1 SOUR Ancestry.com Family Trees" not in gedcom_data
            or "2 CORP Ancestry.com" not in gedcom_data):
        print("Gedcom file does not appear to be from Ancestry.com")
        sys.exit(1)

    for check_dir in [
            "/logs",
            "/media/dbid",
            "/media/apid",
            "/metadata/guid",
            "/metadata/apid",
            "/metadata/dbid",
    ]:
        os.makedirs(args.output + check_dir, exist_ok=True)

    # In pendulum format strings "MM" is the month and "mm" the minute, so
    # the trailing component must be lower case to get hour-minute.
    log_file = (args.output + "/logs/" +
                pendulum.now().format("YYYY-MM-DD-HH-mm") +
                "-ancestry-extract.log")
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(name)-8s %(levelname)-8s %(message)s",
        filename=log_file,
        filemode="a",
    )

    # Mirror log records to the console with a shorter format.
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter("%(asctime)s %(message)s")
    console.setFormatter(formatter)
    logging.getLogger("").addHandler(console)

    if args.config:
        logging.info("Config File:               %s", args.config)
    logging.info("Gedcom File:               %s", args.gedcom)
    logging.info("Output Tree:               %s", args.output)
    logging.info("Save Citation Images:      %s", args.citations)
    logging.info("Save Citation Screenshots: %s", args.screenshots)
    logging.info("Save User Media:           %s", args.media)
    logging.info("Save News Clippings:       %s", args.newspapers)

    # Both loaders run in separate processes and report back through a queue
    # while the browser is starting up.
    gedcom_queue = Queue()
    gedcom_process = Process(target=load_gedcom,
                             args=(gedcom_queue, gedcom_data))
    gedcom_process.start()

    cache_queue = Queue()
    cache_process = Process(target=load_tables,
                            args=(cache_queue, args.output))
    cache_process.start()

    logging.info("Launching browser")

    firefox_profile = FirefoxProfile()
    firefox_profile.set_preference("browser.startup.homepage", "about:blank")
    firefox_profile.set_preference("browser.download.folderList", 2)
    firefox_profile.set_preference("browser.download.panel.shown", False)
    firefox_profile.set_preference("browser.download.manager.showWhenStarting",
                                   False)
    firefox_profile.set_preference("browser.download.dir", args.output)
    firefox_profile.set_preference("browser.helperApps.neverAsk.saveToDisk",
                                   "application/octet-stream")
    firefox_profile.set_preference("places.history.enabled", False)
    firefox_options = Options()
    firefox_options.headless = True
    session = Firefox(options=firefox_options, firefox_profile=firefox_profile)

    atexit.register(session_cleanup, session)
    session.implicitly_wait(15)
    session.fullscreen_window()
    session.options = args
    login(session)

    # Read the queue before joining so the child is not left blocked on an
    # unread result.
    result = cache_queue.get()
    session.checkpoint = result["checkpoint"]
    session.tree_id = result["tree_id"]
    session.tree_name = result["tree_name"]
    session.unavailable = []
    if args.resume or args.ignore:
        session.unavailable = result["unavailable"]
    session.hash_map = result["hash_map"]
    session.images = result["image_cache"]
    cache_process.join()

    result = gedcom_queue.get()
    people = result["people"]
    people_total = len(people)
    family_total = result["families"]
    apid_total = result["apid_total"]
    apid_unique = result["apid_unique"]
    guid_total = result["guid_total"]
    guid_unique = result["guid_unique"]
    gedcom_process.join()

    logging.info("Found %d people and %d families to process", people_total,
                 family_total)
    logging.info(
        "Found %d unique and %d total ancestry citations to process",
        apid_unique,
        apid_total,
    )
    logging.info(
        "Found %d unique and %d total user media items to process",
        guid_unique,
        guid_total,
    )

    print_flag = False
    session.line_number = 0
    success = unavailable = duplicate = skip = timeouts = total = 0
    person_number = family_number = 0
    apid_number = guid_number = 0
    person = husband = wife = ""
    # Most recent NOTE text containing "http"; handed to get_user_media.
    url_note = ""
    logging.info("Starting second pass Gedcom processing")
    for line in gedcom_data.split("\n"):
        session.line_number = session.line_number + 1
        if args.resume and session.line_number < session.checkpoint:
            continue
        # Once the checkpoint is reached stop fast-forwarding.
        args.resume = False
        if len(line) < 5:
            continue

        # line[0] is a one-character string, so it has to be compared with
        # "1"; comparing with the integer 1 is never true.
        if line[0] == "1":
            # reset the url note for new records
            url_note = ""

        fields = line.split(" ")
        if len(fields) < 2:
            # No tag on this line; nothing to dispatch on.
            continue
        tag = fields[1]
        if tag == "SOUR":
            if session.line_number > session.checkpoint:
                session.checkpoint = session.line_number
            continue
        if "@P" in tag:
            person_number = person_number + 1
            husband = wife = ""
            person = people[tag]
            print_flag = False
            continue
        if "@F" in tag:
            family_number = family_number + 1
            husband = wife = person = ""
            print_flag = False
            continue
        if tag == "HUSB":
            husband = people[line[7:]]
            continue
        if tag == "WIFE":
            wife = people[line[7:]]
            continue
        if tag == "NOTE":
            if "http" in line:
                url_note = line[7:]
            continue

        if tag in ["FILE", "_APID"]:
            total = total + 1
            # Start every record with no outcome so a record that is not
            # actually fetched (media/citations disabled, non-image FILE,
            # "::0" APID) is not tallied with the previous record's result.
            result = None
            if not print_flag:
                if session.line_number > session.checkpoint:
                    if person:
                        logging.info(
                            "Processing records for person %s (%d of %d)",
                            person,
                            person_number,
                            people_total,
                        )
                    else:
                        who = join = ""
                        if husband != "":
                            who = husband
                            join = " and "
                        if wife != "":
                            who = who + join + wife
                        logging.info(
                            "Processing records for family of %s (%d of %d)",
                            who,
                            family_number,
                            family_total,
                        )
                    print_flag = True

            if args.media and " FILE " in line and "f=image&guid=" in line:
                guid_number = guid_number + 1
                logging.debug(
                    "User media item %d of %d with %d unique",
                    guid_number,
                    guid_total,
                    guid_unique,
                )
                result = get_user_media(session, line, url_note)
                # The note has been consumed by this media item.
                url_note = ""
            if args.citations and " _APID " in line:
                process_apid = True
                if args.ignore:
                    apid = line.split(" ").pop(2).strip()
                    if apid in session.unavailable:
                        process_apid = False
                        result = "unavailable"
                if process_apid:
                    apid_number = apid_number + 1
                    if "::0" not in line:
                        logging.debug(
                            "Source citation media item %d of %d with %d unique",
                            apid_number,
                            apid_total,
                            apid_unique,
                        )
                        result = get_citation_media(session, line)

            if result == "success":
                success = success + 1
            elif result == "duplicate":
                duplicate = duplicate + 1
            elif result == "unavailable":
                if person:
                    logging.info("Unavailable item for %s", person)
                else:
                    logging.info("Unavailable item for %s / %s", husband, wife)
                unavailable = unavailable + 1
            elif result == "timeout":
                timeouts = timeouts + 1
            elif result == "skip":
                skip = skip + 1

    logging.info("Total overall records:            %d", total)
    logging.info("Total processed records:          %d", success)
    logging.info("Total duplicate records:          %d", duplicate)
    logging.info("Total unavailable records:        %d", unavailable)
    logging.info("Total skipped due to unavailable: %d", skip)
    logging.info("Total skipped due to timeouts:    %d", timeouts)