def create_config(project_name, project_label, pipeline, mailing_list):
    """
    Build the crawler datasource definition for one project mailing list.

    :param project_name: project identifier; part of the datasource id and the
        target of the "project" set-mapping
    :param project_label: target of the "project_label" set-mapping
    :param pipeline: default pipeline id, replaced by ``mailing_list["pipeline"]``
        when that entry exists
    :param mailing_list: mapping with "name", "label" and "mbox" entries and
        optional "pipeline" and "schedule" entries
    :returns: ``(config, schedule)``; ``schedule`` is ``None`` unless the
        mailing list has a "schedule" entry
    """
    # A mailing list may name its own pipeline, which wins over the default.
    pipeline = mailing_list.get("pipeline", pipeline)
    datasource_id = "mailing-list-{0}-{1}".format(project_name, mailing_list["name"])

    # (source, target) pairs that each become a "move" mapping.
    renamed_fields = (
        ("charSet", "charSet_s"),
        ("fetchedDate", "fetchedDate_dt"),
        ("lastModified", "lastModified_dt"),
        ("signature", "dedupeSignature_s"),
        ("contentSignature", "signature_s"),
        ("length", "length_l"),
        ("mimeType", "mimeType_s"),
        ("parent", "parent_s"),
        ("owner", "owner_s"),
        ("group", "group_s"),
    )
    field_rules = [
        {"source": "project", "target": project_name, "operation": "set"},
        {"source": "project_label", "target": project_label, "operation": "set"},
        {"source": "datasource_label", "target": mailing_list["label"], "operation": "set"},
        {"source": "fetchedDate", "target": "publishedOnDate", "operation": "copy"},
    ]
    field_rules.extend(
        {"source": src, "target": dst, "operation": "move"}
        for src, dst in renamed_fields
    )

    # The archive root is configurable; the mbox path is appended to it as-is.
    archive_root = app.config.get(
        "ASF_MAIL_ARCHIVE_BASE_URL", "http://asfmail.lucidworks.io/mail_files/")

    properties = {
        "refreshOlderThan": -1,
        "f.appendTrailingSlashToLinks": False,
        "refreshErrors": False,
        "restrictToTreeIgnoredHostPrefixes": ["www."],
        "dedupeSaveSignature": False,
        "crawlDBType": "in-memory",
        "f.discardLinkURLQueries": False,
        "f.respectMetaEquivRedirects": False,
        "fetchDelayMS": 50,
        "splitArchives": True,
        "refreshAll": True,
        "f.defaultMIMEType": "application/octet-stream",
        "restrictToTreeAllowSubdomains": False,
        "maxItems": -1,
        "f.scrapeLinksBeforeFiltering": False,
        "dedupe": False,
        "f.allowAllCertificates": False,
        "collection": "lucidfind",
        "forceRefresh": False,
        "f.obeyRobots": True,
        "fetchDelayMSPerHost": True,
        "indexCrawlDBToSolr": False,
        "fetchThreads": 1,
        "restrictToTree": True,
        "retainOutlinks": True,
        "f.defaultCharSet": "UTF-8",
        "emitThreads": 1,
        "diagnosticMode": False,
        "delete": True,
        "f.userAgentWebAddr": "",
        "initial_mapping": {
            "id": "FromMap",
            "mappings": field_rules,
            "reservedFieldsMappingAllowed": False,
            "skip": False,
            "label": "field-mapping",
            "type": "field-mapping",
        },
        "restrictToTreeUseHostAndPath": True,
        "f.filteringRootTags": ["body", "head"],
        "f.userAgentEmail": "",
        "f.timeoutMS": 10000,
        "failFastOnStartLinkFailure": True,
        "startLinks": [archive_root + mailing_list["mbox"]],
        "chunkSize": 100,
        "includeRegexes": [],
        "f.obeyRobotsDelay": True,
        "deleteErrorsAfter": -1,
        "f.userAgentName": "Lucidworks-Anda/2.0",
        "retryEmit": True,
        "depth": -1,
        "refreshStartLinks": False,
        "f.maxSizeBytes": 4194304,
        "aliasExpiration": 1,
    }
    config = {
        "id": datasource_id,
        "connector": "lucid.anda",
        "type": "web",
        "pipeline": pipeline,
        "properties": properties,
    }

    if "schedule" in mailing_list:
        schedule = create_schedule(mailing_list["schedule"], config["id"])
    else:
        schedule = None
    return config, schedule
def create_config(project_name, project_label, includes, excludes, schedule, pipeline, stacks):
    """
    Build the web-crawler datasource definition for a project's Stack Overflow tags.

    :param project_name: project identifier; part of the datasource id and the
        target of the "project" set-mapping
    :param project_label: target of both the "project_label" and the
        "datasource_label" set-mappings
    :param includes: value for "includeRegexes"; ignored when falsy
    :param excludes: value for "excludeRegexes"; ignored when falsy
    :param schedule: schedule details handed to ``create_schedule``; returned
        untouched when falsy
    :param pipeline: pipeline id stored in the config
    :param stacks: iterable of mappings, each with a "tag" entry
    :returns: ``(config, schedule)``
    """
    # One start link per tag.
    stack_links = [
        "http://stackoverflow.com/questions/tagged/" + stack["tag"]
        for stack in stacks
    ]

    # (source, target) pairs that each become a "move" mapping.
    renamed_fields = (
        ("charSet", "charSet_s"),
        ("fetchedDate", "fetchedDate_dt"),
        ("lastModified", "lastModified_dt"),
        ("signature", "dedupeSignature_s"),
        ("contentSignature", "signature_s"),
        ("length", "length_l"),
        ("mimeType", "mimeType_s"),
        ("parent", "parent_s"),
        ("owner", "owner_s"),
        ("group", "group_s"),
    )
    field_rules = [
        {"source": "project", "target": project_name, "operation": "set"},
        {"source": "project_label", "target": project_label, "operation": "set"},
        # we only have one crawler for SO, so the project label doubles as datasource label
        {"source": "datasource_label", "target": project_label, "operation": "set"},
        {"source": "fetchedDate", "target": "publishedOnDate", "operation": "copy"},
        {"source": "isBot", "target": "false", "operation": "set"},
    ]
    field_rules.extend(
        {"source": src, "target": dst, "operation": "move"}
        for src, dst in renamed_fields
    )

    properties = {
        "refreshOlderThan": -1,
        "f.appendTrailingSlashToLinks": False,
        "refreshErrors": False,
        "restrictToTreeIgnoredHostPrefixes": ["www."],
        "f.excludeSelectors": [
            "div.related",
            ".topbar",
            ".bottom-notice",
            "#hot-network-questions",
            "#header",
            ".hero-content",
            "#footer",
        ],
        "f.selectorFields": [
            ".post-taglist a.post-tag",
            ".accepted-answer .answercell",
            ".answercell",
            ".postcell .post-text",
        ],
        "f.includeSelectors": ["#mainbar"],
        "dedupeSaveSignature": False,
        "crawlDBType": "in-memory",
        "f.discardLinkURLQueries": False,
        "f.respectMetaEquivRedirects": False,
        "fetchDelayMS": 1000,
        "refreshAll": True,
        "f.defaultMIMEType": "application/octet-stream",
        "restrictToTreeAllowSubdomains": False,
        "maxItems": -1,
        "f.scrapeLinksBeforeFiltering": False,
        "dedupe": False,
        "f.allowAllCertificates": False,
        "collection": "lucidfind",  # TODO: don't hardcode
        "forceRefresh": False,
        "f.obeyRobots": True,
        "fetchDelayMSPerHost": True,
        "indexCrawlDBToSolr": False,
        "fetchThreads": 1,
        "restrictToTree": False,
        "retainOutlinks": True,
        "f.defaultCharSet": "UTF-8",
        "emitThreads": 1,
        "excludeExtensions": [".class", ".bin", ".jar"],
        "diagnosticMode": False,
        "delete": True,
        "f.userAgentWebAddr": "",
        "initial_mapping": {
            "id": "FromMap",
            "mappings": field_rules,
            "reservedFieldsMappingAllowed": False,
            "skip": False,
            "label": "field-mapping",
            "type": "field-mapping",
        },
        "restrictToTreeUseHostAndPath": True,
        "f.filteringRootTags": ["body", "head"],
        "f.userAgentEmail": "",
        "f.timeoutMS": 10000,
        "failFastOnStartLinkFailure": True,
        "startLinks": stack_links,
        "chunkSize": 100,
        "includeRegexes": [],
        "f.obeyRobotsDelay": True,
        "deleteErrorsAfter": -1,
        "f.userAgentName": "Lucidworks-Anda/2.0",
        "retryEmit": True,
        "depth": -1,
        "refreshStartLinks": False,
        "f.maxSizeBytes": 4194304,
        "aliasExpiration": 1,
    }
    config = {
        "id": "stack-{0}".format(project_name),
        "connector": "lucid.web",
        "type": "web",
        "pipeline": pipeline,
        "parserId": "default",
        "properties": properties,
    }

    # Caller-supplied regex lists are only applied when non-empty.
    if excludes:
        properties["excludeRegexes"] = excludes
    if includes:
        properties["includeRegexes"] = includes

    if schedule:
        schedule = create_schedule(schedule, config["id"])
    return config, schedule
def create_jira_datasource_config(project):
    """
    Generate the JIRA data source config for a given project.

    :param project: mapping with "name", "label", "jira_pipeline" and "jira"
        entries. ``project["jira"]`` must provide "name", "label" and "url" and
        may provide "pipeline", "schedule", "jira_user" and "jira_pass".
    :returns: a ``(config, schedule)`` tuple; ``schedule`` is ``None`` unless
        the JIRA entry has a "schedule"
    """
    pipeline = project["jira_pipeline"]
    jira = project["jira"]
    # The JIRA entry's own pipeline wins over the project-wide one.
    if "pipeline" in jira:
        pipeline = jira["pipeline"]
    if pipeline is None:
        pipeline = "jira-default"

    # (source, target) pairs that each become a "move" mapping.
    renamed_fields = (
        ("charSet", "charSet_s"),
        ("fetchedDate", "fetchedDate_dt"),
        ("lastModified", "lastModified_dt"),
        ("signature", "dedupeSignature_s"),
        ("contentSignature", "signature_s"),
        ("length", "length_l"),
        ("mimeType", "mimeType_s"),
        ("parent", "parent_s"),
        ("owner", "owner_s"),
        ("group", "group_s"),
    )

    # NOTE(review): "initial_mapping" sits beside "properties" rather than
    # inside it - confirm this is the layout the JIRA connector expects.
    config = {
        "id": "jira-{0}-{1}".format(project["name"], jira["name"]),
        "connector": "lucid.anda",
        "type": "jira",
        "pipeline": pipeline,
        "properties": {
            "collection": "lucidfind",
            "startLinks": [jira["url"]],
        },
        "initial_mapping": {
            "id": "FromMap",
            "mappings": [
                {"source": "project", "target": project["name"], "operation": "set"},
                {"source": "project_label", "target": project["label"], "operation": "set"},
                {"source": "datasource_label", "target": jira["label"], "operation": "set"},
                {"source": "isBot", "target": "false", "operation": "set"},
            ] + [
                {"source": src, "target": dst, "operation": "move"}
                for src, dst in renamed_fields
            ],
            "reservedFieldsMappingAllowed": False,
            "skip": False,
            "label": "field-mapping",
            "type": "field-mapping",
        },
    }

    schedule = None
    if "schedule" in jira:
        schedule = create_schedule(jira["schedule"], config["id"])

    if "jira_user" in jira:
        # Store plain values: a trailing comma after each assignment used to
        # wrap the username and password in one-element tuples.
        config["properties"]["f.jira_username"] = jira["jira_user"]
        # "jira_pass" is the application-config key under which the password is looked up.
        config["properties"]["f.jira_password"] = app.config.get(jira["jira_pass"])
    return (config, schedule)
def create_config(project_name, project_label, pipeline, mailing_list):
    """
    Build the crawler datasource definition for one project mailing list.

    :param project_name: project identifier; part of the datasource id and the
        target of the "project" set-mapping
    :param project_label: target of the "project_label" set-mapping
    :param pipeline: default pipeline id, replaced by ``mailing_list["pipeline"]``
        when that entry exists
    :param mailing_list: mapping with "name", "label" and "mbox" entries and
        optional "pipeline" and "schedule" entries
    :returns: ``(config, schedule)``; ``schedule`` is ``None`` unless the
        mailing list has a "schedule" entry
    """
    # A mailing list may name its own pipeline, which wins over the default.
    pipeline = mailing_list.get("pipeline", pipeline)
    datasource_id = "mailing-list-{0}-{1}".format(project_name, mailing_list["name"])

    # (source, target) pairs that each become a "move" mapping.
    renamed_fields = (
        ("charSet", "charSet_s"),
        ("fetchedDate", "fetchedDate_dt"),
        ("lastModified", "lastModified_dt"),
        ("signature", "dedupeSignature_s"),
        ("contentSignature", "signature_s"),
        ("length", "length_l"),
        ("mimeType", "mimeType_s"),
        ("parent", "parent_s"),
        ("owner", "owner_s"),
        ("group", "group_s"),
    )
    field_rules = [
        {"source": "project", "target": project_name, "operation": "set"},
        {"source": "project_label", "target": project_label, "operation": "set"},
        {"source": "datasource_label", "target": mailing_list["label"], "operation": "set"},
        {"source": "fetchedDate", "target": "publishedOnDate", "operation": "copy"},
    ]
    field_rules.extend(
        {"source": src, "target": dst, "operation": "move"}
        for src, dst in renamed_fields
    )

    properties = {
        "refreshOlderThan": -1,
        "f.appendTrailingSlashToLinks": False,
        "refreshErrors": False,
        "restrictToTreeIgnoredHostPrefixes": ["www."],
        "dedupeSaveSignature": False,
        "crawlDBType": "in-memory",
        "f.discardLinkURLQueries": False,
        "f.respectMetaEquivRedirects": False,
        "fetchDelayMS": 0,
        "splitArchives": True,
        "refreshAll": True,
        "f.defaultMIMEType": "application/octet-stream",
        "restrictToTreeAllowSubdomains": False,
        "maxItems": -1,
        "f.scrapeLinksBeforeFiltering": False,
        "dedupe": False,
        "f.allowAllCertificates": False,
        "collection": "lucidfind",
        "forceRefresh": False,
        "f.obeyRobots": True,
        "fetchDelayMSPerHost": True,
        "indexCrawlDBToSolr": False,
        "fetchThreads": 5,
        "restrictToTree": True,
        "retainOutlinks": True,
        "f.defaultCharSet": "UTF-8",
        "emitThreads": 5,
        "diagnosticMode": False,
        "delete": True,
        "f.userAgentWebAddr": "",
        "initial_mapping": {
            "id": "FromMap",
            "mappings": field_rules,
            "reservedFieldsMappingAllowed": False,
            "skip": False,
            "label": "field-mapping",
            "type": "field-mapping",
        },
        "restrictToTreeUseHostAndPath": True,
        "f.filteringRootTags": ["body", "head"],
        "f.userAgentEmail": "",
        "f.timeoutMS": 10000,
        "failFastOnStartLinkFailure": True,
        # The mbox path is appended to the fixed archive root as-is.
        "startLinks": ["http://asfmail.lucidworks.io/mail_files/" + mailing_list["mbox"]],
        "chunkSize": 50,
        "includeRegexes": [],
        "f.obeyRobotsDelay": True,
        "deleteErrorsAfter": -1,
        "f.userAgentName": "Lucidworks-Anda/2.0",
        "retryEmit": True,
        "depth": -1,
        "refreshStartLinks": False,
        "f.maxSizeBytes": 4194304,
        "aliasExpiration": 1,
    }
    config = {
        "id": datasource_id,
        "connector": "lucid.anda",
        "type": "web",
        "pipeline": pipeline,
        "properties": properties,
    }

    if "schedule" in mailing_list:
        schedule = create_schedule(mailing_list["schedule"], config["id"])
    else:
        schedule = None
    return config, schedule
def create_config(project_name, project_label, pipeline, website):
    """
    Build the web-crawler datasource definition for one project website.

    :param project_name: project identifier; part of the datasource id and the
        target of the "project" set-mapping
    :param project_label: target of the "project_label" set-mapping
    :param pipeline: default pipeline id, replaced by ``website["pipeline"]``
        when that entry exists
    :param website: mapping with "name", "label" and "url" entries plus the
        optional overrides handled below ("excludes", "includeRegexes",
        tag filters, "multiurl", "additional_mapping", "schedule", ...)
    :returns: ``(config, schedule)``; ``schedule`` is ``None`` unless the
        website has a "schedule" entry
    """
    # A website may name its own pipeline, which wins over the default.
    pipeline = website.get("pipeline", pipeline)
    datasource_id = "website-{0}-{1}".format(project_name, website["name"])

    # (source, target) pairs that each become a "move" mapping.
    renamed_fields = (
        ("charSet", "charSet_s"),
        ("fetchedDate", "fetchedDate_dt"),
        ("lastModified", "lastModified_dt"),
        ("signature", "dedupeSignature_s"),
        ("contentSignature", "signature_s"),
        ("length", "length_l"),
        ("mimeType", "mimeType_s"),
        ("parent", "parent_s"),
        ("owner", "owner_s"),
        ("group", "group_s"),
    )
    field_rules = [
        {"source": "project", "target": project_name, "operation": "set"},
        {"source": "project_label", "target": project_label, "operation": "set"},
        {"source": "datasource_label", "target": website["label"], "operation": "set"},
        {"source": "fetchedDate", "target": "publishedOnDate", "operation": "copy"},
        {"source": "isBot", "target": "false", "operation": "set"},
    ]
    field_rules.extend(
        {"source": src, "target": dst, "operation": "move"}
        for src, dst in renamed_fields
    )

    properties = {
        "refreshOlderThan": -1,
        "f.appendTrailingSlashToLinks": False,
        "refreshErrors": False,
        "restrictToTreeIgnoredHostPrefixes": ["www."],
        "dedupeSaveSignature": False,
        "crawlDBType": "in-memory",
        "f.discardLinkURLQueries": False,
        "f.respectMetaEquivRedirects": False,
        "fetchDelayMS": 50,
        "refreshAll": True,
        "f.defaultMIMEType": "application/octet-stream",
        "restrictToTreeAllowSubdomains": False,
        "maxItems": -1,
        "f.scrapeLinksBeforeFiltering": False,
        "dedupe": False,
        "f.allowAllCertificates": False,
        "collection": "lucidfind",  # TODO: don't hardcode
        "forceRefresh": False,
        "f.obeyRobots": True,
        "fetchDelayMSPerHost": True,
        "indexCrawlDBToSolr": False,
        "fetchThreads": 1,
        "restrictToTree": True,
        "retainOutlinks": True,
        "f.defaultCharSet": "UTF-8",
        "emitThreads": 1,
        "excludeExtensions": [".class", ".bin", ".jar"],
        "diagnosticMode": False,
        "delete": True,
        "f.userAgentWebAddr": "",
        "initial_mapping": {
            "id": "FromMap",
            "mappings": field_rules,
            "reservedFieldsMappingAllowed": False,
            "skip": False,
            "label": "field-mapping",
            "type": "field-mapping",
        },
        "restrictToTreeUseHostAndPath": True,
        "f.filteringRootTags": ["body", "head"],
        "f.userAgentEmail": "",
        "f.timeoutMS": 10000,
        "failFastOnStartLinkFailure": True,
        "startLinks": [website["url"]],
        "chunkSize": 100,
        "includeRegexes": [],
        "f.obeyRobotsDelay": True,
        "deleteErrorsAfter": -1,
        "f.userAgentName": "Lucidworks-Anda/2.0",
        "retryEmit": True,
        "depth": -1,
        "refreshStartLinks": False,
        "f.maxSizeBytes": 4194304,
        "aliasExpiration": 1,
    }
    config = {
        "id": datasource_id,
        "connector": "lucid.web",
        "type": "web",
        "pipeline": pipeline,
        "parserId": "default",
        "properties": properties,
    }

    # (website key, datasource property) pairs: a key present on the website
    # is copied verbatim onto the named property.
    property_overrides = (
        ("excludes", "excludeRegexes"),
        ("includeRegexes", "includeRegexes"),
        ("includeTags", "f.includeTags"),
        ("excludeTags", "f.excludeTags"),
        ("includeTagIDs", "f.includeTagIDs"),
        ("excludeTagIDs", "f.excludeTagIDs"),
        ("excludeTagClasses", "f.excludeTagClasses"),
        ("scrapeLinksBeforeFiltering", "f.scrapeLinksBeforeFiltering"),
        ("restrictToTreeUseHostAndPath", "restrictToTreeUseHostAndPath"),
        ("multiurl", "startLinks"),
    )
    for website_key, property_name in property_overrides:
        if website_key in website:
            properties[property_name] = website[website_key]

    # Up to two extra mapping rules may be appended after the standard ones.
    for extra_key in ("additional_mapping", "additional_mapping_2"):
        if extra_key in website:
            field_rules.append(website[extra_key])

    if "schedule" in website:
        schedule = create_schedule(website["schedule"], config["id"])
    else:
        schedule = None
    return config, schedule
def create_config(project_name, project_label, pipeline, repo):
    """
    Build the GitHub datasource definition for one project repository.

    :param project_name: project identifier; part of the datasource id and the
        target of the "project" set-mapping
    :param project_label: target of the "project_label" set-mapping
    :param pipeline: default pipeline id, replaced by ``repo["pipeline"]`` when
        that entry exists
    :param repo: mapping with "name", "label" and "url" entries; optional
        crawl toggles ("blobs", "branches", "commits", ...), "github_user" /
        "github_pass", "includes", "excludes", "pipeline" and "schedule"
    :returns: ``(config, schedule)``; ``schedule`` is ``None`` unless the repo
        has a "schedule" entry
    """
    # A repository may name its own pipeline, which wins over the default.
    pipeline = repo.get("pipeline", pipeline)
    datasource_id = "github-{0}-{1}".format(project_name, repo["name"])

    properties = {
        "collection": "lucidfind",  # TODO: don't hardcode
        "startLinks": [repo["url"]],
    }

    # (repo key, default) pairs; each is stored under "f.<repo key>".
    crawl_toggles = (
        ("blobs", True),
        ("branches", True),
        ("commits", False),
        ("issues", False),
        ("pull_requests", False),
        ("pull_request_comments", False),
        ("milestones", False),
        ("commit_diffs", False),
        ("releases", False),
    )
    for toggle, default in crawl_toggles:
        properties["f." + toggle] = repo.get(toggle, default)
    properties["fetchThreads"] = 1

    # (source, target) pairs that each become a "move" mapping.
    renamed_fields = (
        ("charSet", "charSet_s"),
        ("fetchedDate", "fetchedDate_dt"),
        ("lastModified", "lastModified_dt"),
        ("signature", "dedupeSignature_s"),
        ("contentSignature", "signature_s"),
        ("length", "length_l"),
        ("mimeType", "mimeType_s"),
        ("parent", "parent_s"),
        ("owner", "owner_s"),
        ("group", "group_s"),
    )
    field_rules = [
        {"source": "project", "target": project_name, "operation": "set"},
        {"source": "project_label", "target": project_label, "operation": "set"},
        {"source": "datasource_label", "target": repo["label"], "operation": "set"},
        {"source": "isBot", "target": "false", "operation": "set"},
    ]
    field_rules.extend(
        {"source": src, "target": dst, "operation": "move"}
        for src, dst in renamed_fields
    )
    properties["initial_mapping"] = {
        "mappings": field_rules,
        "reservedFieldsMappingAllowed": False,
        "skip": False,
        "id": "Anda",
        "label": "field-mapping",
        "type": "field-mapping",
    }

    config = {
        "id": datasource_id,
        "connector": "lucid.anda",
        "type": "github",
        "pipeline": pipeline,
        "properties": properties,
    }

    if "github_user" in repo:
        properties["f.github_username"] = repo["github_user"]
        # "github_pass" is the application-config key under which the password is looked up.
        properties["f.github_password"] = app.config.get(repo["github_pass"])  # TODO: encrypt

    # A single include/exclude value is wrapped into a one-element list.
    if "includes" in repo:
        properties["includeRegexes"] = [repo["includes"]]
    if "excludes" in repo:
        properties["excludeRegexes"] = [repo["excludes"]]

    if "schedule" in repo:
        schedule = create_schedule(repo["schedule"], config["id"])
    else:
        schedule = None
    return config, schedule
def create_config(project_name, project_label, pipeline, repo):
    """
    Build the GitHub datasource definition for one project repository.

    :param project_name: project identifier; part of the datasource id and the
        target of the "project" set-mapping
    :param project_label: target of the "project_label" set-mapping
    :param pipeline: default pipeline id, replaced by ``repo["pipeline"]`` when
        that entry exists
    :param repo: mapping with "name", "label" and "url" entries; optional
        crawl toggles ("blobs", "branches", "commits", ...), "github_user" /
        "github_pass", "includes", "excludes", "pipeline" and "schedule"
    :returns: ``(config, schedule)``; ``schedule`` is ``None`` unless the repo
        has a "schedule" entry
    """
    # A repository may name its own pipeline, which wins over the default.
    pipeline = repo.get("pipeline", pipeline)
    datasource_id = "github-{0}-{1}".format(project_name, repo["name"])

    properties = {
        "collection": "lucidfind",  # TODO: don't hardcode
        "startLinks": [repo["url"]],
    }

    # (repo key, default) pairs; each is stored under "f.<repo key>".
    crawl_toggles = (
        ("blobs", True),
        ("branches", True),
        ("commits", False),
        ("issues", False),
        ("pull_requests", False),
        ("pull_request_comments", False),
        ("milestones", False),
        ("commit_diffs", False),
        ("releases", False),
    )
    for toggle, default in crawl_toggles:
        properties["f." + toggle] = repo.get(toggle, default)
    properties["fetchThreads"] = 1

    # (source, target) pairs that each become a "move" mapping.
    renamed_fields = (
        ("charSet", "charSet_s"),
        ("fetchedDate", "fetchedDate_dt"),
        ("lastModified", "lastModified_dt"),
        ("signature", "dedupeSignature_s"),
        ("contentSignature", "signature_s"),
        ("length", "length_l"),
        ("mimeType", "mimeType_s"),
        ("parent", "parent_s"),
        ("owner", "owner_s"),
        ("group", "group_s"),
    )
    field_rules = [
        {"source": "project", "target": project_name, "operation": "set"},
        {"source": "project_label", "target": project_label, "operation": "set"},
        {"source": "datasource_label", "target": repo["label"], "operation": "set"},
    ]
    field_rules.extend(
        {"source": src, "target": dst, "operation": "move"}
        for src, dst in renamed_fields
    )
    properties["initial_mapping"] = {
        "mappings": field_rules,
        "reservedFieldsMappingAllowed": False,
        "skip": False,
        "id": "Anda",
        "label": "field-mapping",
        "type": "field-mapping",
    }

    config = {
        "id": datasource_id,
        "connector": "lucid.anda",
        "type": "github",
        "pipeline": pipeline,
        "properties": properties,
    }

    if "github_user" in repo:
        properties["f.github_username"] = repo["github_user"]
        # "github_pass" is the application-config key under which the password is looked up.
        properties["f.github_password"] = app.config.get(repo["github_pass"])  # TODO: encrypt

    # A single include/exclude value is wrapped into a one-element list.
    if "includes" in repo:
        properties["includeRegexes"] = [repo["includes"]]
    if "excludes" in repo:
        properties["excludeRegexes"] = [repo["excludes"]]

    if "schedule" in repo:
        schedule = create_schedule(repo["schedule"], config["id"])
    else:
        schedule = None
    return config, schedule
def create_jira_datasource_config(project):
    """
    Generate the JIRA data source config for a given project.

    :param project: mapping with "name", "label", "jira_pipeline" and "jira"
        entries. ``project["jira"]`` must provide "name", "label" and "url" and
        may provide "pipeline", "schedule", "jira_user" and "jira_pass".
    :returns: a ``(config, schedule)`` tuple; ``schedule`` is ``None`` unless
        the JIRA entry has a "schedule"
    """
    pipeline = project["jira_pipeline"]
    jira = project["jira"]
    # The JIRA entry's own pipeline wins over the project-wide one.
    if "pipeline" in jira:
        pipeline = jira["pipeline"]
    if pipeline is None:
        pipeline = "jira-default"

    # (source, target) pairs that each become a "move" mapping.
    renamed_fields = (
        ("charSet", "charSet_s"),
        ("fetchedDate", "fetchedDate_dt"),
        ("lastModified", "lastModified_dt"),
        ("signature", "dedupeSignature_s"),
        ("contentSignature", "signature_s"),
        ("length", "length_l"),
        ("mimeType", "mimeType_s"),
        ("parent", "parent_s"),
        ("owner", "owner_s"),
        ("group", "group_s"),
    )

    # NOTE(review): "initial_mapping" sits beside "properties" rather than
    # inside it - confirm this is the layout the JIRA connector expects.
    config = {
        "id": "jira-{0}-{1}".format(project["name"], jira["name"]),
        "connector": "lucid.anda",
        "type": "jira",
        "pipeline": pipeline,
        "properties": {
            "collection": "lucidfind",
            "startLinks": [jira["url"]],
        },
        "initial_mapping": {
            "id": "FromMap",
            "mappings": [
                {"source": "project", "target": project["name"], "operation": "set"},
                {"source": "project_label", "target": project["label"], "operation": "set"},
                {"source": "datasource_label", "target": jira["label"], "operation": "set"},
                {"source": "isBot", "target": "false", "operation": "set"},
            ] + [
                {"source": src, "target": dst, "operation": "move"}
                for src, dst in renamed_fields
            ],
            "reservedFieldsMappingAllowed": False,
            "skip": False,
            "label": "field-mapping",
            "type": "field-mapping",
        },
    }

    schedule = None
    if "schedule" in jira:
        schedule = create_schedule(jira["schedule"], config["id"])

    if "jira_user" in jira:
        # Store plain values: a trailing comma after each assignment used to
        # wrap the username and password in one-element tuples.
        config["properties"]["f.jira_username"] = jira["jira_user"]
        # "jira_pass" is the application-config key under which the password is looked up.
        config["properties"]["f.jira_password"] = app.config.get(jira["jira_pass"])
    return (config, schedule)
def create_config(project_name, project_label, pipeline, mailing_list):
    """
    Build the web-crawler datasource definition for one project mailing list.

    :param project_name: project identifier; part of the datasource id and the
        target of the "project" set-mapping
    :param project_label: target of the "project_label" set-mapping
    :param pipeline: default pipeline id, replaced by ``mailing_list["pipeline"]``
        when that entry exists
    :param mailing_list: mapping with "name", "label" and "mbox" entries and
        optional "pipeline", "excludes" and "schedule" entries
    :returns: ``(config, schedule)``; ``schedule`` is ``None`` unless the
        mailing list has a "schedule" entry
    """
    # A mailing list may name its own pipeline, which wins over the default.
    pipeline = mailing_list.get("pipeline", pipeline)
    datasource_id = "mailing-list-{0}-{1}".format(project_name, mailing_list["name"])

    # (source, target) pairs that each become a "move" mapping.
    renamed_fields = (
        ("charSet", "charSet_s"),
        ("fetchedDate", "fetchedDate_dt"),
        ("lastModified", "lastModified_dt"),
        ("signature", "dedupeSignature_s"),
        ("contentSignature", "signature_s"),
        ("length", "length_l"),
        ("mimeType", "mimeType_s"),
        ("parent", "parent_s"),
        ("owner", "owner_s"),
        ("group", "group_s"),
    )
    field_rules = [
        {"source": "project", "target": project_name, "operation": "set"},
        {"source": "project_label", "target": project_label, "operation": "set"},
        {"source": "datasource_label", "target": mailing_list["label"], "operation": "set"},
        {"source": "fetchedDate", "target": "publishedOnDate", "operation": "copy"},
    ]
    field_rules.extend(
        {"source": src, "target": dst, "operation": "move"}
        for src, dst in renamed_fields
    )

    # The archive root is configurable; the mbox path is appended to it as-is.
    archive_root = app.config.get(
        "ASF_MAIL_ARCHIVE_BASE_URL", "http://asfmail.lucidworks.io/mail_files/")

    properties = {
        "refreshOlderThan": -1,
        "f.appendTrailingSlashToLinks": False,
        "refreshErrors": False,
        "restrictToTreeIgnoredHostPrefixes": ["www."],
        "dedupeSaveSignature": False,
        "crawlDBType": "in-memory",
        "f.discardLinkURLQueries": False,
        "f.respectMetaEquivRedirects": False,
        "fetchDelayMS": 50,
        #"splitArchives": True,
        "refreshAll": True,
        #"refreshScript": "function shouldRefresh(id, depth, lastModified, lastFetched, lastEmitted, error){\n\tvar date = new Date();\n var month = \"\";\n if (date.getUTCMonth() < 10){\n month = \"0\" + (date.getUTCMonth() + 1);\n } else {\n month = date.getUTCMonth() + 1\n }\n var yearMonth = date.getUTCFullYear() + \"\" + (month);//need 1 based dates\n if (id != null){\n return id.indexOf(yearMonth) != -1\n }\n return false;\n}\n",
        "f.defaultMIMEType": "application/octet-stream",
        "restrictToTreeAllowSubdomains": False,
        "maxItems": -1,
        "f.scrapeLinksBeforeFiltering": False,
        "dedupe": False,
        "f.allowAllCertificates": False,
        "collection": "lucidfind",
        "forceRefresh": False,
        "f.obeyRobots": True,
        "fetchDelayMSPerHost": True,
        "indexCrawlDBToSolr": False,
        "fetchThreads": 1,
        "restrictToTree": True,
        "retainOutlinks": True,
        "f.defaultCharSet": "UTF-8",
        "emitThreads": 1,
        "diagnosticMode": False,
        "delete": True,
        "f.userAgentWebAddr": "",
        "initial_mapping": {
            "id": "FromMap",
            "mappings": field_rules,
            "reservedFieldsMappingAllowed": False,
            "skip": False,
            "label": "field-mapping",
            "type": "field-mapping",
        },
        "restrictToTreeUseHostAndPath": True,
        "f.filteringRootTags": ["body", "head"],
        "f.userAgentEmail": "",
        "f.timeoutMS": 10000,
        "failFastOnStartLinkFailure": True,
        "startLinks": [archive_root + mailing_list["mbox"]],
        "chunkSize": 100,
        "includeRegexes": [],
        "f.obeyRobotsDelay": True,
        "deleteErrorsAfter": -1,
        "f.userAgentName": "Lucidworks-Anda/2.0",
        "retryEmit": True,
        "depth": -1,
        "refreshStartLinks": False,
        "f.maxSizeBytes": 4194304,
        "aliasExpiration": 1,
    }
    config = {
        "id": datasource_id,
        "connector": "lucid.web",
        "type": "web",
        "pipeline": pipeline,
        #"parserId": "default",
        "properties": properties,
    }

    # Automatically exclude date/author sorted links and atom feeds, e.g.
    #   http://asfmail.lucidworks.io/mail_files/ambari-dev/201412.mbox/date
    #   http://asfmail.lucidworks.io/mail_files/ambari-dev/201412.mbox/author
    #   http://asfmail.lucidworks.io/mail_files/ambari-dev/?format=atom
    exclude_patterns = [".*/date.*", ".*/author.*", ".*format=atom.*"]
    if "excludes" in mailing_list:
        # The list's own "excludes" value is appended as one extra entry.
        exclude_patterns.append(mailing_list["excludes"])
    properties["excludeRegexes"] = exclude_patterns

    if "schedule" in mailing_list:
        schedule = create_schedule(mailing_list["schedule"], config["id"])
    else:
        schedule = None
    return config, schedule
def create_config(project_name, project_label, includes, excludes, schedule, pipeline, stacks):
    """
    Build the web-crawler datasource definition for a project's Stack Overflow tags.

    :param project_name: project identifier; part of the datasource id and the
        target of the "project" set-mapping
    :param project_label: target of both the "project_label" and the
        "datasource_label" set-mappings
    :param includes: value for "includeRegexes"; ignored when falsy
    :param excludes: value for "excludeRegexes"; ignored when falsy
    :param schedule: schedule details handed to ``create_schedule``; returned
        untouched when falsy
    :param pipeline: pipeline id stored in the config
    :param stacks: iterable of mappings, each with a "tag" entry
    :returns: ``(config, schedule)``
    """
    # One start link per tag.
    stack_links = [
        "http://stackoverflow.com/questions/tagged/" + stack["tag"]
        for stack in stacks
    ]

    # (source, target) pairs that each become a "move" mapping.
    renamed_fields = (
        ("charSet", "charSet_s"),
        ("fetchedDate", "fetchedDate_dt"),
        ("lastModified", "lastModified_dt"),
        ("signature", "dedupeSignature_s"),
        ("contentSignature", "signature_s"),
        ("length", "length_l"),
        ("mimeType", "mimeType_s"),
        ("parent", "parent_s"),
        ("owner", "owner_s"),
        ("group", "group_s"),
    )
    field_rules = [
        {"source": "project", "target": project_name, "operation": "set"},
        {"source": "project_label", "target": project_label, "operation": "set"},
        # we only have one crawler for SO, so the project label doubles as datasource label
        {"source": "datasource_label", "target": project_label, "operation": "set"},
        {"source": "fetchedDate", "target": "publishedOnDate", "operation": "copy"},
        {"source": "isBot", "target": "false", "operation": "set"},
    ]
    field_rules.extend(
        {"source": src, "target": dst, "operation": "move"}
        for src, dst in renamed_fields
    )

    properties = {
        "refreshOlderThan": -1,
        "f.appendTrailingSlashToLinks": False,
        "refreshErrors": False,
        "restrictToTreeIgnoredHostPrefixes": ["www."],
        "f.excludeSelectors": [
            "div.related",
            ".topbar",
            ".bottom-notice",
            "#hot-network-questions",
            "#header",
            ".hero-content",
            "#footer",
        ],
        "f.selectorFields": [
            ".post-taglist a.post-tag",
            ".accepted-answer .answercell",
            ".answercell",
            ".postcell .post-text",
        ],
        "f.includeSelectors": ["#mainbar"],
        "dedupeSaveSignature": False,
        "crawlDBType": "in-memory",
        "f.discardLinkURLQueries": False,
        "f.respectMetaEquivRedirects": False,
        "fetchDelayMS": 1000,
        "refreshAll": True,
        "f.defaultMIMEType": "application/octet-stream",
        "restrictToTreeAllowSubdomains": False,
        "maxItems": -1,
        "f.scrapeLinksBeforeFiltering": False,
        "dedupe": False,
        "f.allowAllCertificates": False,
        "collection": "lucidfind",  # TODO: don't hardcode
        "forceRefresh": False,
        "f.obeyRobots": True,
        "fetchDelayMSPerHost": True,
        "indexCrawlDBToSolr": False,
        "fetchThreads": 1,
        "restrictToTree": False,
        "retainOutlinks": True,
        "f.defaultCharSet": "UTF-8",
        "emitThreads": 1,
        "excludeExtensions": [".class", ".bin", ".jar"],
        "diagnosticMode": False,
        "delete": True,
        "f.userAgentWebAddr": "",
        "initial_mapping": {
            "id": "FromMap",
            "mappings": field_rules,
            "reservedFieldsMappingAllowed": False,
            "skip": False,
            "label": "field-mapping",
            "type": "field-mapping",
        },
        "restrictToTreeUseHostAndPath": True,
        "f.filteringRootTags": ["body", "head"],
        "f.userAgentEmail": "",
        "f.timeoutMS": 10000,
        "failFastOnStartLinkFailure": True,
        "startLinks": stack_links,
        "chunkSize": 100,
        "includeRegexes": [],
        "f.obeyRobotsDelay": True,
        "deleteErrorsAfter": -1,
        "f.userAgentName": "Lucidworks-Anda/2.0",
        "retryEmit": True,
        "depth": -1,
        "refreshStartLinks": False,
        "f.maxSizeBytes": 4194304,
        "aliasExpiration": 1,
    }
    config = {
        "id": "stack-{0}".format(project_name),
        "connector": "lucid.web",
        "type": "web",
        "pipeline": pipeline,
        "parserId": "default",
        "properties": properties,
    }

    # Caller-supplied regex lists are only applied when non-empty.
    if excludes:
        properties["excludeRegexes"] = excludes
    if includes:
        properties["includeRegexes"] = includes

    if schedule:
        schedule = create_schedule(schedule, config["id"])
    return config, schedule
def create_config(project_name, project_label, pipeline, mailing_list):
    """
    Build the crawler datasource definition for one project mailing list.

    :param project_name: project identifier; part of the datasource id and the
        target of the "project" set-mapping
    :param project_label: target of the "project_label" set-mapping
    :param pipeline: default pipeline id, replaced by ``mailing_list["pipeline"]``
        when that entry exists
    :param mailing_list: mapping with "name", "label" and "mbox" entries and
        optional "pipeline", "excludes" and "schedule" entries
    :returns: ``(config, schedule)``; ``schedule`` is ``None`` unless the
        mailing list has a "schedule" entry
    """
    # A mailing list may name its own pipeline, which wins over the default.
    pipeline = mailing_list.get("pipeline", pipeline)
    datasource_id = "mailing-list-{0}-{1}".format(project_name, mailing_list["name"])

    # (source, target) pairs that each become a "move" mapping.
    renamed_fields = (
        ("charSet", "charSet_s"),
        ("fetchedDate", "fetchedDate_dt"),
        ("lastModified", "lastModified_dt"),
        ("signature", "dedupeSignature_s"),
        ("contentSignature", "signature_s"),
        ("length", "length_l"),
        ("mimeType", "mimeType_s"),
        ("parent", "parent_s"),
        ("owner", "owner_s"),
        ("group", "group_s"),
    )
    field_rules = [
        {"source": "project", "target": project_name, "operation": "set"},
        {"source": "project_label", "target": project_label, "operation": "set"},
        {"source": "datasource_label", "target": mailing_list["label"], "operation": "set"},
        {"source": "fetchedDate", "target": "publishedOnDate", "operation": "copy"},
    ]
    field_rules.extend(
        {"source": src, "target": dst, "operation": "move"}
        for src, dst in renamed_fields
    )

    # The archive root is configurable; the mbox path is appended to it as-is.
    archive_root = app.config.get(
        "ASF_MAIL_ARCHIVE_BASE_URL", "http://asfmail.lucidworks.io/mail_files/")

    properties = {
        "refreshOlderThan": -1,
        "f.appendTrailingSlashToLinks": False,
        "refreshErrors": False,
        "restrictToTreeIgnoredHostPrefixes": ["www."],
        "dedupeSaveSignature": False,
        "crawlDBType": "in-memory",
        "f.discardLinkURLQueries": False,
        "f.respectMetaEquivRedirects": False,
        "fetchDelayMS": 50,
        #"splitArchives": True,
        "refreshAll": True,
        #"refreshScript": "function shouldRefresh(id, depth, lastModified, lastFetched, lastEmitted, error){\n\tvar date = new Date();\n var month = \"\";\n if (date.getUTCMonth() < 10){\n month = \"0\" + (date.getUTCMonth() + 1);\n } else {\n month = date.getUTCMonth() + 1\n }\n var yearMonth = date.getUTCFullYear() + \"\" + (month);//need 1 based dates\n if (id != null){\n return id.indexOf(yearMonth) != -1\n }\n return false;\n}\n",
        "f.defaultMIMEType": "application/octet-stream",
        "restrictToTreeAllowSubdomains": False,
        "maxItems": -1,
        "f.scrapeLinksBeforeFiltering": False,
        "dedupe": False,
        "f.allowAllCertificates": False,
        "collection": "lucidfind",
        "forceRefresh": False,
        "f.obeyRobots": True,
        "fetchDelayMSPerHost": True,
        "indexCrawlDBToSolr": False,
        "fetchThreads": 1,
        "restrictToTree": True,
        "retainOutlinks": True,
        "f.defaultCharSet": "UTF-8",
        "emitThreads": 1,
        "diagnosticMode": False,
        "delete": True,
        "f.userAgentWebAddr": "",
        "initial_mapping": {
            "id": "FromMap",
            "mappings": field_rules,
            "reservedFieldsMappingAllowed": False,
            "skip": False,
            "label": "field-mapping",
            "type": "field-mapping",
        },
        "restrictToTreeUseHostAndPath": True,
        "f.filteringRootTags": ["body", "head"],
        "f.userAgentEmail": "",
        "f.timeoutMS": 10000,
        "failFastOnStartLinkFailure": True,
        "startLinks": [archive_root + mailing_list["mbox"]],
        "chunkSize": 100,
        "includeRegexes": [],
        "f.obeyRobotsDelay": True,
        "deleteErrorsAfter": -1,
        "f.userAgentName": "Lucidworks-Anda/2.0",
        "retryEmit": True,
        "depth": -1,
        "refreshStartLinks": False,
        "f.maxSizeBytes": 4194304,
        "aliasExpiration": 1,
    }
    config = {
        "id": datasource_id,
        "connector": "lucid.anda",
        "type": "web",
        "pipeline": pipeline,
        #"parserId": "default",
        "properties": properties,
    }

    # Automatically exclude date/author sorted links and atom feeds, e.g.
    #   http://asfmail.lucidworks.io/mail_files/ambari-dev/201412.mbox/date
    #   http://asfmail.lucidworks.io/mail_files/ambari-dev/201412.mbox/author
    #   http://asfmail.lucidworks.io/mail_files/ambari-dev/?format=atom
    exclude_patterns = [".*/date.*", ".*/author.*", ".*format=atom.*"]
    if "excludes" in mailing_list:
        # The list's own "excludes" value is appended as one extra entry.
        exclude_patterns.append(mailing_list["excludes"])
    properties["excludeRegexes"] = exclude_patterns

    if "schedule" in mailing_list:
        schedule = create_schedule(mailing_list["schedule"], config["id"])
    else:
        schedule = None
    return config, schedule