Esempio n. 1
0
def create_config(project_name, project_label, pipeline, mailing_list):
    """Build the crawler datasource config for one project mailing list.

    :param project_name: project name; used in the datasource id and set on
        the ``project`` field
    :param project_label: value set on the ``project_label`` field
    :param pipeline: default pipeline id, replaced by
        ``mailing_list["pipeline"]`` when that key is present
    :param mailing_list: mailing-list description; ``name``, ``label`` and
        ``mbox`` are read, ``pipeline`` and ``schedule`` are optional
    :returns: ``(config, schedule)``; ``schedule`` is None unless the mailing
        list has a ``schedule`` entry
    """
    # Individual mailing lists may override the project-level pipeline.
    pipeline = mailing_list["pipeline"] if "pipeline" in mailing_list else pipeline
    datasource_id = "mailing-list-{0}-{1}".format(project_name, mailing_list["name"])

    # Constant fields first, then the plain field renames.
    field_mappings = [
        {"source": "project", "target": project_name, "operation": "set"},
        {"source": "project_label", "target": project_label, "operation": "set"},
        {"source": "datasource_label", "target": mailing_list["label"], "operation": "set"},
        {"source": "fetchedDate", "target": "publishedOnDate", "operation": "copy"},
    ]
    renamed_fields = (
        ("charSet", "charSet_s"),
        ("fetchedDate", "fetchedDate_dt"),
        ("lastModified", "lastModified_dt"),
        ("signature", "dedupeSignature_s"),
        ("contentSignature", "signature_s"),
        ("length", "length_l"),
        ("mimeType", "mimeType_s"),
        ("parent", "parent_s"),
        ("owner", "owner_s"),
        ("group", "group_s"),
    )
    field_mappings.extend(
        {"source": old_name, "target": new_name, "operation": "move"}
        for old_name, new_name in renamed_fields
    )

    # The archive base URL is configurable; the mbox path is appended to it.
    archive_root = app.config.get("ASF_MAIL_ARCHIVE_BASE_URL",
                                  "http://asfmail.lucidworks.io/mail_files/")
    start_link = archive_root + mailing_list["mbox"]

    properties = {
        "refreshOlderThan": -1,
        "f.appendTrailingSlashToLinks": False,
        "refreshErrors": False,
        "restrictToTreeIgnoredHostPrefixes": ["www."],
        "dedupeSaveSignature": False,
        "crawlDBType": "in-memory",
        "f.discardLinkURLQueries": False,
        "f.respectMetaEquivRedirects": False,
        "fetchDelayMS": 50,
        "splitArchives": True,
        "refreshAll": True,
        "f.defaultMIMEType": "application/octet-stream",
        "restrictToTreeAllowSubdomains": False,
        "maxItems": -1,
        "f.scrapeLinksBeforeFiltering": False,
        "dedupe": False,
        "f.allowAllCertificates": False,
        "collection": "lucidfind",
        "forceRefresh": False,
        "f.obeyRobots": True,
        "fetchDelayMSPerHost": True,
        "indexCrawlDBToSolr": False,
        "fetchThreads": 1,
        "restrictToTree": True,
        "retainOutlinks": True,
        "f.defaultCharSet": "UTF-8",
        "emitThreads": 1,
        "diagnosticMode": False,
        "delete": True,
        "f.userAgentWebAddr": "",
        "initial_mapping": {
            "id": "FromMap",
            "mappings": field_mappings,
            "reservedFieldsMappingAllowed": False,
            "skip": False,
            "label": "field-mapping",
            "type": "field-mapping",
        },
        "restrictToTreeUseHostAndPath": True,
        "f.filteringRootTags": ["body", "head"],
        "f.userAgentEmail": "",
        "f.timeoutMS": 10000,
        "failFastOnStartLinkFailure": True,
        "startLinks": [start_link],
        "chunkSize": 100,
        "includeRegexes": [],
        "f.obeyRobotsDelay": True,
        "deleteErrorsAfter": -1,
        "f.userAgentName": "Lucidworks-Anda/2.0",
        "retryEmit": True,
        "depth": -1,
        "refreshStartLinks": False,
        "f.maxSizeBytes": 4194304,
        "aliasExpiration": 1,
    }
    config = {
        "id": datasource_id,
        "connector": "lucid.anda",
        "type": "web",
        "pipeline": pipeline,
        "properties": properties,
    }

    if "schedule" not in mailing_list:
        return config, None
    return config, create_schedule(mailing_list["schedule"], config["id"])
Esempio n. 2
0
def create_config(project_name, project_label, includes, excludes, schedule,
                  pipeline, stacks):
    """Build the Stack Overflow web-crawl datasource config for a project.

    :param project_name: project name; used in the datasource id and set on
        the ``project`` field
    :param project_label: value set on both the ``project_label`` and the
        ``datasource_label`` fields
    :param includes: value for ``includeRegexes``; ignored when falsy
    :param excludes: value for ``excludeRegexes``; ignored when falsy
    :param schedule: schedule details; when truthy they are handed to
        ``create_schedule``, otherwise the value is returned unchanged
    :param pipeline: pipeline id stored in the config
    :param stacks: iterable of dicts, each providing a ``tag`` key
    :returns: ``(config, schedule)``
    """
    tag_root = "http://stackoverflow.com/questions/tagged/"
    stack_links = [tag_root + stack["tag"] for stack in stacks]

    mappings = [
        {"source": "project", "target": project_name, "operation": "set"},
        {"source": "project_label", "target": project_label, "operation": "set"},
        # we only have one crawler for SO, so the project label is reused
        {"source": "datasource_label", "target": project_label, "operation": "set"},
        {"source": "fetchedDate", "target": "publishedOnDate", "operation": "copy"},
        {"source": "isBot", "target": "false", "operation": "set"},
    ]
    renamed_fields = (
        ("charSet", "charSet_s"),
        ("fetchedDate", "fetchedDate_dt"),
        ("lastModified", "lastModified_dt"),
        ("signature", "dedupeSignature_s"),
        ("contentSignature", "signature_s"),
        ("length", "length_l"),
        ("mimeType", "mimeType_s"),
        ("parent", "parent_s"),
        ("owner", "owner_s"),
        ("group", "group_s"),
    )
    mappings += [
        {"source": old_name, "target": new_name, "operation": "move"}
        for old_name, new_name in renamed_fields
    ]

    properties = {
        "refreshOlderThan": -1,
        "f.appendTrailingSlashToLinks": False,
        "refreshErrors": False,
        "restrictToTreeIgnoredHostPrefixes": ["www."],
        "f.excludeSelectors": [
            "div.related",
            ".topbar",
            ".bottom-notice",
            "#hot-network-questions",
            "#header",
            ".hero-content",
            "#footer",
        ],
        "f.selectorFields": [
            ".post-taglist a.post-tag",
            ".accepted-answer .answercell",
            ".answercell",
            ".postcell .post-text",
        ],
        "f.includeSelectors": ["#mainbar"],
        "dedupeSaveSignature": False,
        "crawlDBType": "in-memory",
        "f.discardLinkURLQueries": False,
        "f.respectMetaEquivRedirects": False,
        "fetchDelayMS": 1000,
        "refreshAll": True,
        "f.defaultMIMEType": "application/octet-stream",
        "restrictToTreeAllowSubdomains": False,
        "maxItems": -1,
        "f.scrapeLinksBeforeFiltering": False,
        "dedupe": False,
        "f.allowAllCertificates": False,
        "collection": "lucidfind",  # TODO: don't hardcode
        "forceRefresh": False,
        "f.obeyRobots": True,
        "fetchDelayMSPerHost": True,
        "indexCrawlDBToSolr": False,
        "fetchThreads": 1,
        "restrictToTree": False,
        "retainOutlinks": True,
        "f.defaultCharSet": "UTF-8",
        "emitThreads": 1,
        "excludeExtensions": [".class", ".bin", ".jar"],
        "diagnosticMode": False,
        "delete": True,
        "f.userAgentWebAddr": "",
        "initial_mapping": {
            "id": "FromMap",
            "mappings": mappings,
            "reservedFieldsMappingAllowed": False,
            "skip": False,
            "label": "field-mapping",
            "type": "field-mapping",
        },
        "restrictToTreeUseHostAndPath": True,
        "f.filteringRootTags": ["body", "head"],
        "f.userAgentEmail": "",
        "f.timeoutMS": 10000,
        "failFastOnStartLinkFailure": True,
        "startLinks": stack_links,
        "chunkSize": 100,
        "includeRegexes": [],
        "f.obeyRobotsDelay": True,
        "deleteErrorsAfter": -1,
        "f.userAgentName": "Lucidworks-Anda/2.0",
        "retryEmit": True,
        "depth": -1,
        "refreshStartLinks": False,
        "f.maxSizeBytes": 4194304,
        "aliasExpiration": 1,
    }
    config = {
        "id": "stack-{0}".format(project_name),
        "connector": "lucid.web",
        "type": "web",
        "pipeline": pipeline,
        "parserId": "default",
        "properties": properties,
    }

    # Caller-supplied regex filters only apply when they are non-empty.
    if excludes:
        properties["excludeRegexes"] = excludes
    if includes:
        properties["includeRegexes"] = includes

    if schedule:
        schedule = create_schedule(schedule, config["id"])
    return config, schedule
Esempio n. 3
0
def create_jira_datasource_config(project):
    """
    Generate the JIRA data source config for a given project

    :param project: the project dict; ``name``, ``label`` and ``jira`` are
        read, ``jira_pipeline`` is optional
    :returns: a ``(config, schedule)`` tuple; ``schedule`` is None unless
        ``project["jira"]`` has a ``schedule`` entry
    """
    jira = project["jira"]

    # Precedence: per-JIRA pipeline, then the project-wide JIRA pipeline,
    # then the built-in default. A missing "jira_pipeline" key is treated
    # like an explicit None instead of raising KeyError.
    if "pipeline" in jira:
        pipeline = jira["pipeline"]
    else:
        pipeline = project.get("jira_pipeline")
    if pipeline is None:
        pipeline = "jira-default"

    mappings = [
        {"source": "project", "target": project["name"], "operation": "set"},
        {"source": "project_label", "target": project["label"], "operation": "set"},
        {"source": "datasource_label", "target": jira["label"], "operation": "set"},
        {"source": "isBot", "target": "false", "operation": "set"},
    ]
    renamed_fields = (
        ("charSet", "charSet_s"),
        ("fetchedDate", "fetchedDate_dt"),
        ("lastModified", "lastModified_dt"),
        ("signature", "dedupeSignature_s"),
        ("contentSignature", "signature_s"),
        ("length", "length_l"),
        ("mimeType", "mimeType_s"),
        ("parent", "parent_s"),
        ("owner", "owner_s"),
        ("group", "group_s"),
    )
    mappings.extend(
        {"source": old_name, "target": new_name, "operation": "move"}
        for old_name, new_name in renamed_fields
    )

    # "initial_mapping" belongs inside "properties", as in the other
    # datasource builders; a misplaced brace previously left it at the top
    # level of the config.
    properties = {
        "collection": "lucidfind",
        "startLinks": [jira["url"]],
        "initial_mapping": {
            "id": "FromMap",
            "mappings": mappings,
            "reservedFieldsMappingAllowed": False,
            "skip": False,
            "label": "field-mapping",
            "type": "field-mapping",
        },
    }
    config = {
        "id": "jira-{0}-{1}".format(project["name"], jira["name"]),
        "connector": "lucid.anda",
        "type": "jira",
        "pipeline": pipeline,
        "properties": properties,
    }

    schedule = None
    if "schedule" in jira:
        schedule = create_schedule(jira["schedule"], config["id"])

    if "jira_user" in jira:
        # No trailing commas here: they would turn each value into a 1-tuple.
        properties["f.jira_username"] = jira["jira_user"]
        # "jira_pass" names the app config entry that holds the password.
        properties["f.jira_password"] = app.config.get(jira["jira_pass"])
    return (config, schedule)
Esempio n. 4
0
def create_config(project_name, project_label, pipeline, mailing_list):
    """Build the crawler datasource config for one project mailing list.

    :param project_name: project name; used in the datasource id and set on
        the ``project`` field
    :param project_label: value set on the ``project_label`` field
    :param pipeline: default pipeline id, replaced by
        ``mailing_list["pipeline"]`` when that key is present
    :param mailing_list: mailing-list description; ``name``, ``label`` and
        ``mbox`` are read, ``pipeline`` and ``schedule`` are optional
    :returns: ``(config, schedule)``; ``schedule`` is None unless the mailing
        list has a ``schedule`` entry
    """
    # Individual mailing lists may override the project-level pipeline.
    pipeline = mailing_list["pipeline"] if "pipeline" in mailing_list else pipeline
    datasource_id = "mailing-list-{0}-{1}".format(project_name,
                                                  mailing_list["name"])

    field_mappings = [
        {"source": "project", "target": project_name, "operation": "set"},
        {"source": "project_label", "target": project_label, "operation": "set"},
        {"source": "datasource_label", "target": mailing_list["label"], "operation": "set"},
        {"source": "fetchedDate", "target": "publishedOnDate", "operation": "copy"},
    ]
    renamed_fields = (
        ("charSet", "charSet_s"),
        ("fetchedDate", "fetchedDate_dt"),
        ("lastModified", "lastModified_dt"),
        ("signature", "dedupeSignature_s"),
        ("contentSignature", "signature_s"),
        ("length", "length_l"),
        ("mimeType", "mimeType_s"),
        ("parent", "parent_s"),
        ("owner", "owner_s"),
        ("group", "group_s"),
    )
    field_mappings.extend(
        {"source": old_name, "target": new_name, "operation": "move"}
        for old_name, new_name in renamed_fields
    )

    # The crawl starts at the list's mbox path under the mail archive root.
    start_link = "http://asfmail.lucidworks.io/mail_files/" + mailing_list["mbox"]

    properties = {
        "refreshOlderThan": -1,
        "f.appendTrailingSlashToLinks": False,
        "refreshErrors": False,
        "restrictToTreeIgnoredHostPrefixes": ["www."],
        "dedupeSaveSignature": False,
        "crawlDBType": "in-memory",
        "f.discardLinkURLQueries": False,
        "f.respectMetaEquivRedirects": False,
        "fetchDelayMS": 0,
        "splitArchives": True,
        "refreshAll": True,
        "f.defaultMIMEType": "application/octet-stream",
        "restrictToTreeAllowSubdomains": False,
        "maxItems": -1,
        "f.scrapeLinksBeforeFiltering": False,
        "dedupe": False,
        "f.allowAllCertificates": False,
        "collection": "lucidfind",
        "forceRefresh": False,
        "f.obeyRobots": True,
        "fetchDelayMSPerHost": True,
        "indexCrawlDBToSolr": False,
        "fetchThreads": 5,
        "restrictToTree": True,
        "retainOutlinks": True,
        "f.defaultCharSet": "UTF-8",
        "emitThreads": 5,
        "diagnosticMode": False,
        "delete": True,
        "f.userAgentWebAddr": "",
        "initial_mapping": {
            "id": "FromMap",
            "mappings": field_mappings,
            "reservedFieldsMappingAllowed": False,
            "skip": False,
            "label": "field-mapping",
            "type": "field-mapping",
        },
        "restrictToTreeUseHostAndPath": True,
        "f.filteringRootTags": ["body", "head"],
        "f.userAgentEmail": "",
        "f.timeoutMS": 10000,
        "failFastOnStartLinkFailure": True,
        "startLinks": [start_link],
        "chunkSize": 50,
        "includeRegexes": [],
        "f.obeyRobotsDelay": True,
        "deleteErrorsAfter": -1,
        "f.userAgentName": "Lucidworks-Anda/2.0",
        "retryEmit": True,
        "depth": -1,
        "refreshStartLinks": False,
        "f.maxSizeBytes": 4194304,
        "aliasExpiration": 1,
    }
    config = {
        "id": datasource_id,
        "connector": "lucid.anda",
        "type": "web",
        "pipeline": pipeline,
        "properties": properties,
    }

    if "schedule" not in mailing_list:
        return config, None
    return config, create_schedule(mailing_list["schedule"], config["id"])
def create_config(project_name, project_label, pipeline, website):
    """Build the web-crawl datasource config for one project website.

    :param project_name: project name; used in the datasource id and set on
        the ``project`` field
    :param project_label: value set on the ``project_label`` field
    :param pipeline: default pipeline id, replaced by ``website["pipeline"]``
        when that key is present
    :param website: website description; ``name``, ``label`` and ``url`` are
        read, the remaining keys handled below are optional overrides
    :returns: ``(config, schedule)``; ``schedule`` is None unless the website
        has a ``schedule`` entry
    """
    # Individual websites may override the project-level pipeline.
    pipeline = website["pipeline"] if "pipeline" in website else pipeline
    datasource_id = "website-{0}-{1}".format(project_name, website["name"])

    mappings = [
        {"source": "project", "target": project_name, "operation": "set"},
        {"source": "project_label", "target": project_label, "operation": "set"},
        {"source": "datasource_label", "target": website["label"], "operation": "set"},
        {"source": "fetchedDate", "target": "publishedOnDate", "operation": "copy"},
        {"source": "isBot", "target": "false", "operation": "set"},
    ]
    renamed_fields = (
        ("charSet", "charSet_s"),
        ("fetchedDate", "fetchedDate_dt"),
        ("lastModified", "lastModified_dt"),
        ("signature", "dedupeSignature_s"),
        ("contentSignature", "signature_s"),
        ("length", "length_l"),
        ("mimeType", "mimeType_s"),
        ("parent", "parent_s"),
        ("owner", "owner_s"),
        ("group", "group_s"),
    )
    mappings.extend(
        {"source": old_name, "target": new_name, "operation": "move"}
        for old_name, new_name in renamed_fields
    )

    properties = {
        "refreshOlderThan": -1,
        "f.appendTrailingSlashToLinks": False,
        "refreshErrors": False,
        "restrictToTreeIgnoredHostPrefixes": ["www."],
        "dedupeSaveSignature": False,
        "crawlDBType": "in-memory",
        "f.discardLinkURLQueries": False,
        "f.respectMetaEquivRedirects": False,
        "fetchDelayMS": 50,
        "refreshAll": True,
        "f.defaultMIMEType": "application/octet-stream",
        "restrictToTreeAllowSubdomains": False,
        "maxItems": -1,
        "f.scrapeLinksBeforeFiltering": False,
        "dedupe": False,
        "f.allowAllCertificates": False,
        "collection": "lucidfind",  # TODO: don't hardcode
        "forceRefresh": False,
        "f.obeyRobots": True,
        "fetchDelayMSPerHost": True,
        "indexCrawlDBToSolr": False,
        "fetchThreads": 1,
        "restrictToTree": True,
        "retainOutlinks": True,
        "f.defaultCharSet": "UTF-8",
        "emitThreads": 1,
        "excludeExtensions": [".class", ".bin", ".jar"],
        "diagnosticMode": False,
        "delete": True,
        "f.userAgentWebAddr": "",
        "initial_mapping": {
            "id": "FromMap",
            "mappings": mappings,
            "reservedFieldsMappingAllowed": False,
            "skip": False,
            "label": "field-mapping",
            "type": "field-mapping",
        },
        "restrictToTreeUseHostAndPath": True,
        "f.filteringRootTags": ["body", "head"],
        "f.userAgentEmail": "",
        "f.timeoutMS": 10000,
        "failFastOnStartLinkFailure": True,
        "startLinks": [website["url"]],
        "chunkSize": 100,
        "includeRegexes": [],
        "f.obeyRobotsDelay": True,
        "deleteErrorsAfter": -1,
        "f.userAgentName": "Lucidworks-Anda/2.0",
        "retryEmit": True,
        "depth": -1,
        "refreshStartLinks": False,
        "f.maxSizeBytes": 4194304,
        "aliasExpiration": 1,
    }
    config = {
        "id": datasource_id,
        "connector": "lucid.web",
        "type": "web",
        "pipeline": pipeline,
        "parserId": "default",
        "properties": properties,
    }

    # Optional per-website settings: (key in ``website``, crawler property).
    optional_properties = (
        ("excludes", "excludeRegexes"),
        ("includeRegexes", "includeRegexes"),
        ("includeTags", "f.includeTags"),
        ("excludeTags", "f.excludeTags"),
        ("includeTagIDs", "f.includeTagIDs"),
        ("excludeTagIDs", "f.excludeTagIDs"),
        ("excludeTagClasses", "f.excludeTagClasses"),
        ("scrapeLinksBeforeFiltering", "f.scrapeLinksBeforeFiltering"),
        ("restrictToTreeUseHostAndPath", "restrictToTreeUseHostAndPath"),
        ("multiurl", "startLinks"),  # replaces the single-url start link
    )
    for website_key, property_name in optional_properties:
        if website_key in website:
            properties[property_name] = website[website_key]

    # Up to two extra field mappings may be appended after the standard ones.
    for extra_key in ("additional_mapping", "additional_mapping_2"):
        if extra_key in website:
            mappings.append(website[extra_key])

    if "schedule" not in website:
        return config, None
    return config, create_schedule(website["schedule"], config["id"])
Esempio n. 6
0
def create_config(project_name, project_label, pipeline, repo):
    """Build the GitHub datasource config for one project repository.

    :param project_name: project name; used in the datasource id and set on
        the ``project`` field
    :param project_label: value set on the ``project_label`` field
    :param pipeline: default pipeline id, replaced by ``repo["pipeline"]``
        when that key is present
    :param repo: repository description; ``name``, ``label`` and ``url`` are
        read, the crawl toggles, credentials, ``includes``, ``excludes`` and
        ``schedule`` are optional
    :returns: ``(config, schedule)``; ``schedule`` is None unless the repo
        has a ``schedule`` entry
    """
    # Individual repositories may override the project-level pipeline.
    pipeline = repo["pipeline"] if "pipeline" in repo else pipeline
    datasource_id = "github-{0}-{1}".format(project_name, repo["name"])

    properties = {
        "collection": "lucidfind",  # TODO: don't hardcode
        "startLinks": [repo["url"]],
    }
    # Crawl toggles: each ``f.<name>`` property takes ``repo[<name>]`` when
    # given, otherwise the default listed here.
    toggle_defaults = (
        ("blobs", True),
        ("branches", True),
        ("commits", False),
        ("issues", False),
        ("pull_requests", False),
        ("pull_request_comments", False),
        ("milestones", False),
        ("commit_diffs", False),
        ("releases", False),
    )
    for toggle, default in toggle_defaults:
        properties["f." + toggle] = repo.get(toggle, default)
    properties["fetchThreads"] = 1

    mappings = [
        {"source": "project", "target": project_name, "operation": "set"},
        {"source": "project_label", "target": project_label, "operation": "set"},
        {"source": "datasource_label", "target": repo["label"], "operation": "set"},
        {"source": "isBot", "target": "false", "operation": "set"},
    ]
    renamed_fields = (
        ("charSet", "charSet_s"),
        ("fetchedDate", "fetchedDate_dt"),
        ("lastModified", "lastModified_dt"),
        ("signature", "dedupeSignature_s"),
        ("contentSignature", "signature_s"),
        ("length", "length_l"),
        ("mimeType", "mimeType_s"),
        ("parent", "parent_s"),
        ("owner", "owner_s"),
        ("group", "group_s"),
    )
    mappings.extend(
        {"source": old_name, "target": new_name, "operation": "move"}
        for old_name, new_name in renamed_fields
    )
    properties["initial_mapping"] = {
        "mappings": mappings,
        "reservedFieldsMappingAllowed": False,
        "skip": False,
        "id": "Anda",
        "label": "field-mapping",
        "type": "field-mapping",
    }

    config = {
        "id": datasource_id,
        "connector": "lucid.anda",
        "type": "github",
        "pipeline": pipeline,
        "properties": properties,
    }

    if "github_user" in repo:
        properties["f.github_username"] = repo["github_user"]
        # "github_pass" names the app config entry holding the password.
        properties["f.github_password"] = app.config.get(
            repo["github_pass"])  # TODO: encrypt

    # A single include/exclude value is wrapped into a one-element list.
    if "includes" in repo:
        properties["includeRegexes"] = [repo["includes"]]
    if "excludes" in repo:
        properties["excludeRegexes"] = [repo["excludes"]]

    if "schedule" not in repo:
        return config, None
    return config, create_schedule(repo["schedule"], config["id"])
Esempio n. 7
0
def create_config(project_name, project_label, pipeline, repo):
    """Build the GitHub datasource config for a single repository.

    :param project_name: project identifier; part of the datasource id and
        the value set for the ``project`` field
    :param project_label: value set for the ``project_label`` field
    :param pipeline: pipeline to use unless ``repo`` names its own
    :param repo: mapping with "name", "url" and "label" entries, plus
        optional "pipeline", per-feature flags ("blobs", "commits", ...),
        "github_user"/"github_pass", "includes", "excludes" and "schedule"
    :returns: ``(config, schedule)``; ``schedule`` is None unless ``repo``
        has a "schedule" entry, in which case it is whatever
        ``create_schedule`` returns for it
    """
    # An individual repository may override the pipeline passed in.
    chosen_pipeline = repo.get("pipeline", pipeline)
    datasource_id = "github-{0}-{1}".format(project_name, repo["name"])

    properties = {
        "collection": "lucidfind",  # TODO: don't hardcode
        "startLinks": [repo["url"]],
    }

    # Per-repo flags copied into "f.<flag>" properties, each paired with the
    # default used when the repo entry omits it.
    flag_defaults = (
        ("blobs", True),
        ("branches", True),
        ("commits", False),
        ("issues", False),
        ("pull_requests", False),
        ("pull_request_comments", False),
        ("milestones", False),
        ("commit_diffs", False),
        ("releases", False),
    )
    for flag, fallback in flag_defaults:
        properties["f." + flag] = repo.get(flag, fallback)
    properties["fetchThreads"] = 1

    # "set" mappings: (source, target value).
    fixed_values = (
        ("project", project_name),
        ("project_label", project_label),
        ("datasource_label", repo["label"]),
    )
    # "move" mappings: (source field, target field).
    renames = (
        ("charSet", "charSet_s"),
        ("fetchedDate", "fetchedDate_dt"),
        ("lastModified", "lastModified_dt"),
        ("signature", "dedupeSignature_s"),
        ("contentSignature", "signature_s"),
        ("length", "length_l"),
        ("mimeType", "mimeType_s"),
        ("parent", "parent_s"),
        ("owner", "owner_s"),
        ("group", "group_s"),
    )
    field_mappings = [
        {"source": src, "target": value, "operation": "set"}
        for src, value in fixed_values
    ]
    field_mappings.extend(
        {"source": src, "target": dst, "operation": "move"}
        for src, dst in renames
    )
    properties["initial_mapping"] = {
        "mappings": field_mappings,
        "reservedFieldsMappingAllowed": False,
        "skip": False,
        "id": "Anda",
        "label": "field-mapping",
        "type": "field-mapping",
    }

    config = {
        "id": datasource_id,
        "connector": "lucid.anda",
        "type": "github",
        "pipeline": chosen_pipeline,
        "properties": properties,
    }

    if "github_user" in repo:
        properties["f.github_username"] = repo["github_user"]
        # repo["github_pass"] is used as a key into app.config, not as the
        # password itself.
        properties["f.github_password"] = app.config.get(repo["github_pass"])  # TODO: encrypt

    # A single include/exclude pattern from the repo entry becomes a
    # one-element regex list.
    for repo_key, property_key in (("includes", "includeRegexes"),
                                   ("excludes", "excludeRegexes")):
        if repo_key in repo:
            properties[property_key] = [repo[repo_key]]

    if "schedule" not in repo:
        return config, None
    return config, create_schedule(repo["schedule"], datasource_id)
Esempio n. 8
0
def create_jira_datasource_config(project):
    """
    Generate the JIRA data source config for a given project

    :param project: the project; must provide "name", "label",
        "jira_pipeline" (may be None) and a "jira" mapping with "name",
        "url" and "label" entries plus optional "pipeline", "schedule",
        "jira_user" and "jira_pass" entries
    :returns: a ``(config, schedule)`` tuple; ``schedule`` is None unless
        the "jira" mapping has a "schedule" entry, in which case it is
        whatever ``create_schedule`` returns for it
    """
    pipeline = project["jira_pipeline"]
    jira = project["jira"]
    # The JIRA entry may override the project-level pipeline.
    if "pipeline" in jira:
        pipeline = jira["pipeline"]
    if pipeline is None:
        pipeline = "jira-default"

    config = {
        "id": "jira-{0}-{1}".format(project["name"], jira["name"]),
        "connector": "lucid.anda",
        "type": "jira",
        "pipeline": pipeline,
        "properties": {
            "collection": "lucidfind",
            "startLinks": [jira["url"]]
        },
        # NOTE(review): initial_mapping sits beside "properties" here rather
        # than inside it -- confirm this is what the connector expects.
        "initial_mapping": {
            "id": "FromMap",
            "mappings": [
                {"source": "project", "target": project["name"], "operation": "set"},
                {"source": "project_label", "target": project["label"], "operation": "set"},
                {"source": "datasource_label", "target": jira["label"], "operation": "set"},
                {"source": "isBot", "target": "false", "operation": "set"},
                {"source": "charSet", "target": "charSet_s", "operation": "move"},
                {"source": "fetchedDate", "target": "fetchedDate_dt", "operation": "move"},
                {"source": "lastModified", "target": "lastModified_dt", "operation": "move"},
                {"source": "signature", "target": "dedupeSignature_s", "operation": "move"},
                {"source": "contentSignature", "target": "signature_s", "operation": "move"},
                {"source": "length", "target": "length_l", "operation": "move"},
                {"source": "mimeType", "target": "mimeType_s", "operation": "move"},
                {"source": "parent", "target": "parent_s", "operation": "move"},
                {"source": "owner", "target": "owner_s", "operation": "move"},
                {"source": "group", "target": "group_s", "operation": "move"}
            ],
            "reservedFieldsMappingAllowed": False,
            "skip": False,
            "label": "field-mapping",
            "type": "field-mapping"
        }
    }
    schedule = None
    if "schedule" in jira:
        details = jira["schedule"]
        schedule = create_schedule(details, config["id"])

    if "jira_user" in jira:
        # These assignments must not end with a comma: a trailing comma
        # would store a one-element tuple instead of the plain value.
        config['properties']["f.jira_username"] = jira["jira_user"]
        # jira["jira_pass"] is used as a key into app.config, not as the
        # password itself.
        config['properties']["f.jira_password"] = app.config.get(
            jira["jira_pass"])
    return (config, schedule)
Esempio n. 9
0
def create_config(project_name, project_label, pipeline, mailing_list):
  """Build the web datasource config for a single mailing-list archive.

  :param project_name: project identifier; part of the datasource id and the
    value set for the ``project`` field
  :param project_label: value set for the ``project_label`` field
  :param pipeline: pipeline to use unless ``mailing_list`` names its own
  :param mailing_list: mapping with "name", "label" and "mbox" entries, plus
    optional "pipeline", "excludes" and "schedule" entries
  :returns: ``(config, schedule)``; ``schedule`` is None unless
    ``mailing_list`` has a "schedule" entry, in which case it is whatever
    ``create_schedule`` returns for it
  """
  # An individual mailing list may override the pipeline passed in.
  chosen_pipeline = mailing_list["pipeline"] if "pipeline" in mailing_list else pipeline
  datasource_id = "mailing-list-{0}-{1}".format(project_name, mailing_list["name"])

  # "set" mappings: (source, target value).
  fixed_values = (
    ("project", project_name),
    ("project_label", project_label),
    ("datasource_label", mailing_list["label"]),
  )
  # "move" mappings: (source field, target field).
  renames = (
    ("charSet", "charSet_s"),
    ("fetchedDate", "fetchedDate_dt"),
    ("lastModified", "lastModified_dt"),
    ("signature", "dedupeSignature_s"),
    ("contentSignature", "signature_s"),
    ("length", "length_l"),
    ("mimeType", "mimeType_s"),
    ("parent", "parent_s"),
    ("owner", "owner_s"),
    ("group", "group_s"),
  )
  field_mappings = [
    {"source": src, "target": value, "operation": "set"}
    for src, value in fixed_values
  ]
  # fetchedDate is copied to publishedOnDate before the move below renames it.
  field_mappings.append({"source": "fetchedDate", "target": "publishedOnDate", "operation": "copy"})
  field_mappings.extend(
    {"source": src, "target": dst, "operation": "move"}
    for src, dst in renames
  )

  # Crawl starts at the list's mbox path under the configured archive base URL.
  archive_url = app.config.get("ASF_MAIL_ARCHIVE_BASE_URL", "http://asfmail.lucidworks.io/mail_files/") + mailing_list["mbox"]

  properties = {
    "refreshOlderThan": -1,
    "f.appendTrailingSlashToLinks": False,
    "refreshErrors": False,
    "restrictToTreeIgnoredHostPrefixes": ["www."],
    "dedupeSaveSignature": False,
    "crawlDBType": "in-memory",
    "f.discardLinkURLQueries": False,
    "f.respectMetaEquivRedirects": False,
    "fetchDelayMS": 50,
    #"splitArchives": True,
    "refreshAll": True,
    #"refreshScript": "function shouldRefresh(id, depth, lastModified, lastFetched, lastEmitted, error){\n\tvar date = new Date();\n  var month = \"\";\n  if (date.getUTCMonth() < 10){\n    month = \"0\" + (date.getUTCMonth() + 1);\n  } else {\n    month = date.getUTCMonth()  + 1\n  }\n  var yearMonth = date.getUTCFullYear() + \"\" + (month);//need 1 based dates\n  if (id != null){\n    return id.indexOf(yearMonth) != -1\n  }\n  return false;\n}\n",
    "f.defaultMIMEType": "application/octet-stream",
    "restrictToTreeAllowSubdomains": False,
    "maxItems": -1,
    "f.scrapeLinksBeforeFiltering": False,
    "dedupe": False,
    "f.allowAllCertificates": False,
    "collection": "lucidfind",
    "forceRefresh": False,
    "f.obeyRobots": True,
    "fetchDelayMSPerHost": True,
    "indexCrawlDBToSolr": False,
    "fetchThreads": 1,
    "restrictToTree": True,
    "retainOutlinks": True,
    "f.defaultCharSet": "UTF-8",
    "emitThreads": 1,
    "diagnosticMode": False,
    "delete": True,
    "f.userAgentWebAddr": "",
    "initial_mapping": {
      "id": "FromMap",
      "mappings": field_mappings,
      "reservedFieldsMappingAllowed": False,
      "skip": False,
      "label": "field-mapping",
      "type": "field-mapping",
    },
    "restrictToTreeUseHostAndPath": True,
    "f.filteringRootTags": ["body", "head"],
    "f.userAgentEmail": "",
    "f.timeoutMS": 10000,
    "failFastOnStartLinkFailure": True,
    "startLinks": [archive_url],
    "chunkSize": 100,
    "includeRegexes": [],
    "f.obeyRobotsDelay": True,
    "deleteErrorsAfter": -1,
    "f.userAgentName": "Lucidworks-Anda/2.0",
    "retryEmit": True,
    "depth": -1,
    "refreshStartLinks": False,
    "f.maxSizeBytes": 4194304,
    "aliasExpiration": 1,
  }
  config = {
    "id": datasource_id,
    "connector": "lucid.web",
    "type": "web",
    "pipeline": chosen_pipeline,
    #"parserId": "default",
    "properties": properties,
  }

  # Always exclude the date/author sorted views and the atom feed, e.g.
  # http://asfmail.lucidworks.io/mail_files/ambari-dev/201412.mbox/date
  # http://asfmail.lucidworks.io/mail_files/ambari-dev/201412.mbox/author
  # http://asfmail.lucidworks.io/mail_files/ambari-dev/?format=atom
  exclude_patterns = [".*/date.*", ".*/author.*", ".*format=atom.*"]
  if "excludes" in mailing_list:
    exclude_patterns.append(mailing_list["excludes"])
  properties["excludeRegexes"] = exclude_patterns

  if "schedule" not in mailing_list:
    return config, None
  return config, create_schedule(mailing_list["schedule"], datasource_id)
Esempio n. 10
0
def create_config(project_name, project_label, includes, excludes, schedule, pipeline, stacks):
    stack_links = []
    for stack in stacks:
        stack_links.append("http://stackoverflow.com/questions/tagged/" + stack["tag"])
    config = {
        'id': "stack-{0}".format(project_name),
        "connector": "lucid.web",
        "type": "web",
        'pipeline': pipeline,
        "parserId": "default",
        "properties": {
            "refreshOlderThan": -1,
            "f.appendTrailingSlashToLinks": False,
            "refreshErrors": False,
            "restrictToTreeIgnoredHostPrefixes": [
                "www."
            ],
            "f.excludeSelectors": [
                "div.related",
                ".topbar",
                ".bottom-notice",
                "#hot-network-questions",
                "#header",
                ".hero-content",
                "#footer"
            ],
            "f.selectorFields": [
                ".post-taglist a.post-tag", ".accepted-answer .answercell",
                ".answercell",
                ".postcell .post-text"],
            "f.includeSelectors": [
                "#mainbar"
            ],
            "dedupeSaveSignature": False,
            "crawlDBType": "in-memory",
            "f.discardLinkURLQueries": False,
            "f.respectMetaEquivRedirects": False,
            "fetchDelayMS": 1000,
            "refreshAll": True,
            "f.defaultMIMEType": "application/octet-stream",
            "restrictToTreeAllowSubdomains": False,
            "maxItems": -1,
            "f.scrapeLinksBeforeFiltering": False,
            "dedupe": False,
            "f.allowAllCertificates": False,
            "collection": "lucidfind",  # TODO: don't hardcode
            "forceRefresh": False,
            "f.obeyRobots": True,
            "fetchDelayMSPerHost": True,
            "indexCrawlDBToSolr": False,
            "fetchThreads": 1,
            "restrictToTree": False,
            "retainOutlinks": True,
            "f.defaultCharSet": "UTF-8",
            "emitThreads": 1,
            "excludeExtensions": [
                ".class",
                ".bin",
                ".jar"
            ],
            "diagnosticMode": False,
            "delete": True,
            "f.userAgentWebAddr": "",
            "initial_mapping": {
                "id": "FromMap",
                "mappings": [
                    {"source": "project", "target": project_name, "operation": "set"},
                    {"source": "project_label", "target": project_label, "operation": "set"},
                    {"source": "datasource_label", "target": project_label, "operation": "set"}, # we only have one crawler for SO
                    {"source": "fetchedDate", "target": "publishedOnDate", "operation": "copy"},
                    {"source": "isBot", "target": "false", "operation": "set"},
                    {
                        "source": "charSet",
                        "target": "charSet_s",
                        "operation": "move"
                    },
                    {
                        "source": "fetchedDate",
                        "target": "fetchedDate_dt",
                        "operation": "move"
                    },
                    {
                        "source": "lastModified",
                        "target": "lastModified_dt",
                        "operation": "move"
                    },
                    {
                        "source": "signature",
                        "target": "dedupeSignature_s",
                        "operation": "move"
                    },
                    {
                        "source": "contentSignature",
                        "target": "signature_s",
                        "operation": "move"
                    },
                    {
                        "source": "length",
                        "target": "length_l",
                        "operation": "move"
                    },
                    {
                        "source": "mimeType",
                        "target": "mimeType_s",
                        "operation": "move"
                    },
                    {
                        "source": "parent",
                        "target": "parent_s",
                        "operation": "move"
                    },
                    {
                        "source": "owner",
                        "target": "owner_s",
                        "operation": "move"
                    },
                    {
                        "source": "group",
                        "target": "group_s",
                        "operation": "move"
                    }
                ],
                "reservedFieldsMappingAllowed": False,
                "skip": False,
                "label": "field-mapping",
                "type": "field-mapping"
            },
            "restrictToTreeUseHostAndPath": True,
            "f.filteringRootTags": [
                "body",
                "head"
            ],
            "f.userAgentEmail": "",
            "f.timeoutMS": 10000,
            "failFastOnStartLinkFailure": True,
            "startLinks": stack_links,
            "chunkSize": 100,
            "includeRegexes": [],
            "f.obeyRobotsDelay": True,
            "deleteErrorsAfter": -1,
            "f.userAgentName": "Lucidworks-Anda/2.0",
            "retryEmit": True,
            "depth": -1,
            "refreshStartLinks": False,
            "f.maxSizeBytes": 4194304,
            "aliasExpiration": 1
        }
    }
    if excludes:
        config['properties']['excludeRegexes'] = excludes
    if includes:
        config['properties']['includeRegexes'] = includes
    if schedule:
        details = schedule
        schedule = create_schedule(details, config["id"])
    return config, schedule
Esempio n. 11
0
def create_config(project_name, project_label, pipeline, mailing_list):
  """Build the web datasource config for a single mailing-list archive.

  :param project_name: project identifier; part of the datasource id and the
    value set for the ``project`` field
  :param project_label: value set for the ``project_label`` field
  :param pipeline: pipeline to use unless ``mailing_list`` names its own
  :param mailing_list: mapping with "name", "label" and "mbox" entries, plus
    optional "pipeline", "excludes" and "schedule" entries
  :returns: ``(config, schedule)``; ``schedule`` is None unless
    ``mailing_list`` has a "schedule" entry, in which case it is whatever
    ``create_schedule`` returns for it
  """
  # An individual mailing list may override the pipeline passed in.
  chosen_pipeline = mailing_list["pipeline"] if "pipeline" in mailing_list else pipeline
  datasource_id = "mailing-list-{0}-{1}".format(project_name, mailing_list["name"])

  # "set" mappings: (source, target value).
  fixed_values = (
    ("project", project_name),
    ("project_label", project_label),
    ("datasource_label", mailing_list["label"]),
  )
  # "move" mappings: (source field, target field).
  renames = (
    ("charSet", "charSet_s"),
    ("fetchedDate", "fetchedDate_dt"),
    ("lastModified", "lastModified_dt"),
    ("signature", "dedupeSignature_s"),
    ("contentSignature", "signature_s"),
    ("length", "length_l"),
    ("mimeType", "mimeType_s"),
    ("parent", "parent_s"),
    ("owner", "owner_s"),
    ("group", "group_s"),
  )
  field_mappings = [
    {"source": src, "target": value, "operation": "set"}
    for src, value in fixed_values
  ]
  # fetchedDate is copied to publishedOnDate before the move below renames it.
  field_mappings.append({"source": "fetchedDate", "target": "publishedOnDate", "operation": "copy"})
  field_mappings.extend(
    {"source": src, "target": dst, "operation": "move"}
    for src, dst in renames
  )

  # Crawl starts at the list's mbox path under the configured archive base URL.
  archive_url = app.config.get("ASF_MAIL_ARCHIVE_BASE_URL", "http://asfmail.lucidworks.io/mail_files/") + mailing_list["mbox"]

  properties = {
    "refreshOlderThan": -1,
    "f.appendTrailingSlashToLinks": False,
    "refreshErrors": False,
    "restrictToTreeIgnoredHostPrefixes": ["www."],
    "dedupeSaveSignature": False,
    "crawlDBType": "in-memory",
    "f.discardLinkURLQueries": False,
    "f.respectMetaEquivRedirects": False,
    "fetchDelayMS": 50,
    #"splitArchives": True,
    "refreshAll": True,
    #"refreshScript": "function shouldRefresh(id, depth, lastModified, lastFetched, lastEmitted, error){\n\tvar date = new Date();\n  var month = \"\";\n  if (date.getUTCMonth() < 10){\n    month = \"0\" + (date.getUTCMonth() + 1);\n  } else {\n    month = date.getUTCMonth()  + 1\n  }\n  var yearMonth = date.getUTCFullYear() + \"\" + (month);//need 1 based dates\n  if (id != null){\n    return id.indexOf(yearMonth) != -1\n  }\n  return false;\n}\n",
    "f.defaultMIMEType": "application/octet-stream",
    "restrictToTreeAllowSubdomains": False,
    "maxItems": -1,
    "f.scrapeLinksBeforeFiltering": False,
    "dedupe": False,
    "f.allowAllCertificates": False,
    "collection": "lucidfind",
    "forceRefresh": False,
    "f.obeyRobots": True,
    "fetchDelayMSPerHost": True,
    "indexCrawlDBToSolr": False,
    "fetchThreads": 1,
    "restrictToTree": True,
    "retainOutlinks": True,
    "f.defaultCharSet": "UTF-8",
    "emitThreads": 1,
    "diagnosticMode": False,
    "delete": True,
    "f.userAgentWebAddr": "",
    "initial_mapping": {
      "id": "FromMap",
      "mappings": field_mappings,
      "reservedFieldsMappingAllowed": False,
      "skip": False,
      "label": "field-mapping",
      "type": "field-mapping",
    },
    "restrictToTreeUseHostAndPath": True,
    "f.filteringRootTags": ["body", "head"],
    "f.userAgentEmail": "",
    "f.timeoutMS": 10000,
    "failFastOnStartLinkFailure": True,
    "startLinks": [archive_url],
    "chunkSize": 100,
    "includeRegexes": [],
    "f.obeyRobotsDelay": True,
    "deleteErrorsAfter": -1,
    "f.userAgentName": "Lucidworks-Anda/2.0",
    "retryEmit": True,
    "depth": -1,
    "refreshStartLinks": False,
    "f.maxSizeBytes": 4194304,
    "aliasExpiration": 1,
  }
  config = {
    "id": datasource_id,
    "connector": "lucid.anda",
    "type": "web",
    "pipeline": chosen_pipeline,
    #"parserId": "default",
    "properties": properties,
  }

  # Always exclude the date/author sorted views and the atom feed, e.g.
  # http://asfmail.lucidworks.io/mail_files/ambari-dev/201412.mbox/date
  # http://asfmail.lucidworks.io/mail_files/ambari-dev/201412.mbox/author
  # http://asfmail.lucidworks.io/mail_files/ambari-dev/?format=atom
  exclude_patterns = [".*/date.*", ".*/author.*", ".*format=atom.*"]
  if "excludes" in mailing_list:
    exclude_patterns.append(mailing_list["excludes"])
  properties["excludeRegexes"] = exclude_patterns

  if "schedule" not in mailing_list:
    return config, None
  return config, create_schedule(mailing_list["schedule"], datasource_id)