Ejemplo n.º 1
0
# See the License for the specific language governing permissions and
# limitations under the License.
"""Get profile picture from Twitter profiles"""

import json
import requests
import urllib

import sling
import sling.flags as flags
import sling.log as log
import sling.task.data as data
from sling.task.workflow import *

flags.define("--twitterdb",
             help="database for storing Twitter profiles",
             default="http://localhost:7070/twitter",
             metavar="DBURL")

# Profile image URLs flagged as bad, stored as a set of exact URL strings.
bad_images = {
    "http://pbs.twimg.com/profile_images/1302121919014207490/KaYYEC8b.jpg",
}


# Task for extracting images from Twitter profiles.
class TwitterExtract:
    def run(self, task):
        # Get parameters.
        twitterdb = task.input("twitterdb").name

        # Load knowledge base.
        log.info("Load knowledge base")
        kb = sling.Store()
Ejemplo n.º 2
0
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run SLING command"""

import importlib
import subprocess
import sys
import time

import sling
import sling.flags as flags
import sling.log as log
import sling.task.workflow as workflow

# Command-line flags.
flags.define("COMMAND", help="commands(s) to perform", default=[], nargs="*")

flags.define("-l",
             "--list",
             help="list commands",
             default=False,
             action="store_true")

flags.define("--spawn",
             help="run command in background",
             default=False,
             action="store_true")

flags.define("--version",
             help="print version information",
             default=False,
Ejemplo n.º 3
0
# limitations under the License.
"""Run SLING processing"""

import sling
import sling.flags as flags
import sling.log as log
import sling.task.corpora as corpora
import sling.task.download as download
import sling.task.wiki as wiki
import sling.task.embedding as embedding
import sling.task.entity as entity
import sling.task.workflow as workflow

# Command-line flags.
flags.define("--download_wikidata",
             help="download wikidata dump",
             default=False,
             action='store_true')

flags.define("--download_wikipedia",
             help="download wikipedia dump(s)",
             default=False,
             action='store_true')

flags.define("--import_wikidata",
             help="convert wikidata to sling format",
             default=False,
             action='store_true')

flags.define("--import_wikipedia",
             help="convert wikidata dump(s) to sling format",
             default=False,
Ejemplo n.º 4
0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Numerical gradient checking."""

import sling
import sling.myelin as myelin
import sling.flags as flags
import numpy as np
import math

# Command-line flags.
flags.define("--fp64", action='store_true', default=False)
flags.define("--seed", default=None)
flags.define("--test")
flags.define("--dump_data", action='store_true', default=False)

flags.parse()
compiler = myelin.Compiler()

# Seed NumPy's global random generator only when a truthy seed was supplied.
if flags.arg.seed:
  np.random.seed(int(flags.arg.seed))

# Shape and element types; --fp64 selects float64 instead of the defaults
# (myelin.DT_FLOAT / np.float32).
shape = [16]
if flags.arg.fp64:
  dtype, nptype = "float64", np.float64
else:
  dtype, nptype = myelin.DT_FLOAT, np.float32
Ejemplo n.º 5
0
"""Fetch media files and store in media cache database."""

import email.utils
import datetime
import hashlib
import requests
import sys
import traceback
import urllib

import sling
import sling.flags as flags

flags.define("--kb",
             default="data/e/kb/kb.sling",
             help="Knowledge base with media references")

flags.define("--mediadb",
             default="http://localhost:7070/media",
             help="Media database")

flags.define("--max_media_size",
             help="Maximum media file size",
             default=63*1024*1024,
             type=int,
             metavar="SIZE")

flags.define("--blacklist",
             default="local/media-blacklist.txt",
             help="List of blacklisted media files")
Ejemplo n.º 6
0
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Workflow builder for Wikidata and Wikipedia processing"""

from workflow import *
import corpora
import sling.flags as flags

flags.define("--index",
             help="index wiki data sets",
             default=False,
             action='store_true')

flags.define("--only_primary_language",
             help="only use wikidata labels from primary language",
             default=False,
             action='store_true')

flags.define("--only_known_languages",
             help="only use wikidata labels from known languages",
             default=False,
             action='store_true')

flags.define("--snapshot_kb",
             help="create snapshot for knowledge base",
             default=False,
Ejemplo n.º 7
0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Numerical gradient checking."""

import sling
import sling.myelin as myelin
import sling.flags as flags
import numpy as np
import math

# Command-line flags.
flags.define("--fp64", action='store_true', default=False)

flags.parse()
compiler = myelin.Compiler()

# Shape and element types; --fp64 selects float64 instead of the defaults
# (myelin.DT_FLOAT / np.float32).
shape = [16]
if flags.arg.fp64:
  dtype, nptype = "float64", np.float64
else:
  dtype, nptype = myelin.DT_FLOAT, np.float32

# Compute number of elements in shape.
def elements(shape):
  n = 1
  for d in shape: n *= d
Ejemplo n.º 8
0
Archivo: run.py Proyecto: savkov/sling
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run SLING processing"""

import sling
import sling.flags as flags
import sling.log as log
import sling.task.corpora as corpora
import sling.task.download as download
import sling.task.wiki as wiki
import sling.task.embedding as embedding
import sling.task.workflow as workflow

# Command-line flags.
flags.define("--download_wikidata",
             help="download wikidata dump",
             default=False,
             action='store_true')

flags.define("--download_wikipedia",
             help="download wikipedia dump(s)",
             default=False,
             action='store_true')

flags.define("--import_wikidata",
             help="convert wikidata to sling format",
             default=False,
             action='store_true')

flags.define("--import_wikipedia",
             help="convert wikidata dump(s) to sling format",
             default=False,
Ejemplo n.º 9
0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Class defining a dashboard of the status of Sling updates to WikiData."""

import pywikibot
import sling
import sling.flags as flags
import glob

flags.define("--test",
             help="use test record file",
             default=False,
             action='store_true')

# Maps each SLING date granularity constant to the matching entry of
# pywikibot's WbTime.PRECISION table.
# NOTE(review): 'millenia' is presumably the key spelling that pywikibot itself
# uses -- confirm against pywikibot before "correcting" it.
precision_map = {
  granularity: pywikibot.WbTime.PRECISION[wb_key]
  for granularity, wb_key in (
    (sling.MILLENNIUM, 'millenia'),
    (sling.CENTURY, 'century'),
    (sling.DECADE, 'decade'),
    (sling.YEAR, 'year'),
    (sling.MONTH, 'month'),
    (sling.DAY, 'day'),
  )
}

class WikiMonitor:
  def __init__(self):
    self.site = pywikibot.Site("wikidata", "wikidata")
    self.repo = self.site.data_repository()
Ejemplo n.º 10
0
                        break
                    if other_qid != qid and other_qid not in seen:
                        seen.add(other_qid)
                        self._text(other_category.name)
                        self._form_anchor(" (= %s)" % other_qid, other_qid)
                        self._text(" (%0.4f)" % other_parse.score)
                        self._br()
                self._end("td")
            self._end("tr")
        self._end("table")


# Script entry point: load the category parses, then serve the Browser handler
# over HTTP.
if __name__ == "__main__":
    # Command-line flags: listening port and the recordio file with parses.
    flags.define("--port",
                 help="port number for the HTTP server",
                 default=8001,
                 type=int,
                 metavar="PORT")
    flags.define(
        "--parses",
        help="Recordio of category parses",
        default="local/data/e/wikicat/parses-with-match-statistics.rec",
        type=str,
        metavar="FILE")
    flags.parse()
    # The parses are read before the server object is created.
    log.info('Reading parses from %s' % flags.arg.parses)
    browser_globals.read(flags.arg.parses)
    # An empty host string is passed as the bind address.
    server_address = ('', flags.arg.port)
    httpd = HTTPServer(server_address, Browser)
    log.info('Starting HTTP Server on port %d' % flags.arg.port)
    # Last statement of the script; presumably blocks while handling requests
    # (HTTPServer's import is outside this excerpt -- confirm).
    httpd.serve_forever()
Ejemplo n.º 11
0
            break
          if other_qid != qid and other_qid not in seen:
            seen.add(other_qid)
            self._text(other_category.name)
            self._form_anchor(" (= %s)" % other_qid, other_qid)
            self._text(" (%0.4f)" % other_parse.score)
            self._br()
        self._end("td")
      self._end("tr")
    self._end("table")


if __name__ == "__main__":
  flags.define("--port",
               help="port number for the HTTP server",
               default=8001,
               type=int,
               metavar="PORT")
  flags.define("--parses",
               help="Recordio of category parses",
               default="local/data/e/wikicat/parses-with-match-statistics.rec",
               type=str,
               metavar="FILE")
  flags.define("--output",
               help="Output dir where Wikibot recordios will be generated.",
               default="local/data/e/wikicat/",
               type=str,
               metavar="DIR")
  flags.parse()
  log.info('Reading parses from %s' % flags.arg.parses)
  browser_globals.init(flags.arg.parses, flags.arg.output)
Ejemplo n.º 12
0
    def attach_fact_matches(self, input_parses):
        """Add a fact-matcher task over `input_parses` and return its output.

        Inside the "attach-fact-matches" workflow namespace this creates a
        "category-parse-fact-matcher" task, hands it to self.kb_input
        (presumably to attach the knowledge base inputs; that helper is
        defined outside this excerpt), attaches `input_parses` as its "parses"
        input, and routes its "output" to the resource
        "parses-with-match-statistics.rec" (format "records/frame") under
        self.outdir.

        Returns the output resource.
        """
        with self.wf.namespace("attach-fact-matches"):
            task = self.wf.task("category-parse-fact-matcher")
            self.kb_input(task)
            task.attach_input("parses", input_parses)
            stats_resource = self.wf.resource(
                "parses-with-match-statistics.rec",
                dir=self.outdir,
                format="records/frame")
            task.attach_output("output", stats_resource)
            return stats_resource


if __name__ == '__main__':
    flags.define("--port",
                 help="port number for task monitor (0 means no monitor)",
                 default=6767,
                 type=int,
                 metavar="PORT")
    flags.define("--output",
                 help="Output directory",
                 default="data/e/wikicat",
                 type=str,
                 metavar="DIR")
    flags.define("--lang",
                 help="Language to process",
                 default="en",
                 type=str,
                 metavar="LANG")
    flags.define("--min_members",
                 help="Reject categories with less than these many members",
                 default=5,
Ejemplo n.º 13
0
"""
Fetch the Danish Company Registry (CVR) and store the records in a database.
"""

import sys
import requests
import json
import sling
import sling.flags as flags

flags.define("--apikey",
             help="CVR API key file",
             default="local/keys/cvr.txt",
             metavar="FILE")

flags.define("--start",
             help="Start time for fetching CVR updates",
             default=None,
             metavar="YYYY-MM-DD")

flags.define("--end",
             help="End time for fetching CVR updates",
             default=None,
             metavar="YYYY-MM-DD")

flags.define("--cvrdb",
             help="database for storing CVR records",
             default="http://localhost:7070/cvr",
             metavar="DBURL")

flags.parse()
Ejemplo n.º 14
0
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Class for updating wikidata with extracted facts from a record file."""
import pywikibot
import sling
import json
import sys
import datetime
import sling.flags as flags

flags.define("--first", help="first record to update", default=0, type=int)

flags.define("--last",
             help="last record to update",
             default=sys.maxsize,
             type=int)

flags.define("--test",
             help="use test record file",
             default=False,
             action='store_true')

flags.define("--batch",
             help="number of records to update",
             default=3,
             type=int)
Ejemplo n.º 15
0
            ["/trace/_str"])

        # There should be the same number of actions in the step.
        checker.check_eq(len(base_actions), len(expt_actions), \
          "Step %d: # of actions" % i)

      # There should be the same number of steps.
      checker.check_eq(len(base_steps), len(expt_steps), "# of Steps")

    base_reader.close()
    expt_reader.close()


  flags.define('--base',
               help='Base recordio',
               default="",
               type=str,
               metavar='FILE')
  flags.define('--expt',
               help='Expt recordio',
               default="",
               type=str,
               metavar='FILE')
  flags.define('--commons',
               help='Commons',
               default="",
               type=str,
               metavar='FILE')
  flags.define('--diff',
               help='File where sample diff (if any) will be written',
               default="/tmp/diff.txt",
Ejemplo n.º 16
0
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Corpus locations"""

import os
import sling.flags as flags

# Command-line flags.
flags.define("--language",
             help="primary language for resources",
             default="en",
             metavar="LANG")

flags.define("--languages",
             help="list of languages to process",
             metavar="LANG,...")

flags.define("--wikidata",
             help="wikidata version",
             default="latest",
             metavar="YYYYMMDD")

flags.define("--wikipedia",
             help="wikipedia version",
             default="latest",
             metavar="YYYYMMDD")
Ejemplo n.º 17
0
# Prints evaluation metrics.
def print_metrics(header, metrics):
  """Print precision, recall and F1 for each metric group to stdout.

  The output is a blank line, a "<header> metrics" title, a dashed underline
  of the same width, and then one block per group (SPAN, FRAME, TYPE, ROLE,
  SLOT) with three "  <GROUP>_<name>: <value>" lines followed by a blank line.

  Args:
    header: title text printed in front of the word "metrics".
    metrics: mapping that holds a numeric value for every key of the form
      "<GROUP>_<Precision|Recall|F1>".

  Raises:
    KeyError: if `metrics` is a dict lacking one of the expected keys.
  """
  # Each print call gets one pre-formatted string in parentheses. The old
  # print statements are a SyntaxError on Python 3, whereas this form produces
  # the same output on Python 2 and Python 3.
  title = "%s metrics" % (header,)
  print("\n" + title)
  print("-" * len(title))
  for metric in ['SPAN', 'FRAME', 'TYPE', 'ROLE', 'SLOT']:
    for name in ['Precision', 'Recall', 'F1']:
      key = metric + "_" + name
      print("  %s: %f" % (key, metrics[key]))
    # An empty string (not a bare print()) so Python 2 also emits a blank line.
    print("")


if __name__ == '__main__':
  flags.define('--flow',
               help='Flow file',
               default='',
               type=str,
               metavar='FLOW')
  flags.define('--strip',
               help='Output flow file which drops "dev" blobs',
               default='',
               type=str,
               metavar='FLOW')
  flags.define('--training_details',
               help='Print training details or not',
               default=False,
               action='store_true')
  flags.define('--output_commons',
               help='Output file to store commons',
               default='',
               type=str,
Ejemplo n.º 18
0
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Datasets shared across workflows."""

import sling.flags as flags
import sling.task.corpora as corpora
from sling.task import *

flags.define("--extra_items",
             help="additional items with info",
             default=None,
             metavar="RECFILES")


class Datasets:
    def __init__(self, wf):
        self.wf = wf

    #---------------------------------------------------------------------------
    # Repository
    #---------------------------------------------------------------------------

    def language_defs(self):
        """Resource for language definitions. This defines the /lang/<lang>
    symbols and has meta information for each language."""
        return self.wf.resource("languages.sling",
Ejemplo n.º 19
0
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Compare Myelin flow computations with NumPy."""

import sling
import sling.flags as flags
import sling.myelin as myelin
import sling.myelin.simulator as simulator
import numpy as np
import sys
import struct

flags.define("--dt", default=myelin.DT_FLOAT)
flags.define("--test")
flags.define("--thorough", default=False, action='store_true')
flags.define("--repeat", default=1, type=int)
flags.define("--skipdiff", default=False, action='store_true')

flags.parse()
dt = flags.arg.dt

print("Myelin test suite for", dt, flags.arg.cpu)
print()


# Statistics for test runs.
class Test:
    def __init__(self, f):
Ejemplo n.º 20
0
    gold = document.gold
    for index, cascade in enumerate(cascades):
      cascade_gold_sequence = cascade.translate(gold)
      delegate = 0
      cost = 0
      for cascade_gold in cascade_gold_sequence:
        cost += cascade.delegates[delegate].size()
        counts[index][delegate] += 1
        if cascade_gold.is_cascade():
          delegate = cascade_gold.delegate
        else:
          delegate = 0
      costs[index] += cost
  for c, cost, cascade in zip(counts, costs, cascades):
    print "\n", cascade.__class__.__name__, "cost =", cost, "\n", \
      "Delegate invocations:", c, "\n", cascade

if __name__ == '__main__':
  import sling.flags as flags

  # Both flags are plain strings that default to the empty string.
  flags.define('--commons', type=str, default='', help='Commons store')
  flags.define('--input', type=str, default='', help='Input corpora')
  flags.parse()

  # Run the estimator with the parsed flag values.
  print_cost_estimates(flags.arg.commons, flags.arg.input)

Ejemplo n.º 21
0
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Workflow for Wikidata and Wikipedia processing"""

import sling.flags as flags
from sling.task import *
import sling.task.corpora as corpora

flags.define("--index",
             help="index wiki data sets",
             default=False,
             action='store_true')

flags.define("--only_primary_language",
             help="only use wikidata labels from primary language",
             default=False,
             action='store_true')

flags.define("--only_known_languages",
             help="only use wikidata labels from known languages",
             default=False,
             action='store_true')

flags.define("--skip_wikipedia_mapping",
             help="skip wikipedia mapping step",
             default=False,
Ejemplo n.º 22
0
import http.cookiejar
import json
import os
import sys
import re
import requests
import time
import traceback
import urllib.parse
from threading import Thread
from queue import Queue
import sling
import sling.flags as flags

flags.define("--crawldb",
             help="database for crawled news articles",
             default="http://localhost:7070/crawl",
             metavar="URL")

flags.define("--newssites",
             default="data/crawl/newssites.txt",
             help="list of news sites")

flags.define("--cookiedir",
             default="local/cookies",
             help="directory for site-specific cookies")

flags.define("--threads",
             help="number of thread for crawler worker pool",
             default=10,
             type=int,
             metavar="NUM")
Ejemplo n.º 23
0
# Returns true if 'filename' appears in the list of ids in 'allowed_ids'.
def file_allowed(allowed_ids, filename):
    """Return whether `filename` passes the `allowed_ids` filter.

    An empty `allowed_ids` allows every file. Otherwise the filename is
    reduced to the part starting at the first 'data/english/annotations'
    (or to the empty string when that marker is absent), and the result is
    looked up in `allowed_ids`.
    """
    if len(allowed_ids) == 0:
        return True

    marker = 'data/english/annotations'
    start = filename.find(marker)
    relative = filename[start:] if start >= 0 else ''
    return relative in allowed_ids


if __name__ == "__main__":
    import os
    import sys

    flags.define('--input',
                 help='CONLL folder name ending in "annotations"',
                 default='',
                 type=str)
    flags.define('--output',
                 help='Output recordio file',
                 default='/tmp/output.rec',
                 type=str)
    flags.define('--max',
                 help='Maximum number of files to process (-1 for all)',
                 default=-1,
                 type=int)
    flags.define('--summary',
                 help='Output file where the summary will be written.',
                 default='',
                 type=str)
    flags.define(
        '--constituency_schema',
Ejemplo n.º 24
0
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fetch profile information from Twitter"""

import json
import requests
import sys
import time
import tweepy
import urllib
import sling
import sling.flags as flags

flags.define("--apikeys",
             default="local/keys/twitter.json",
             help="Twitter API key file")

flags.define("--twitterdb",
             help="database for storing Twitter profiles",
             default="http://localhost:7070/twitter",
             metavar="DBURL")

flags.define("--mediadb",
             help="database for storing Twitter profiles pictures",
             default=None,
             metavar="DBURL")

flags.define("--update",
             help="refresh all updated profiles",
             default=False,
Ejemplo n.º 25
0
# limitations under the License.
"""Monitor Wikimedia change stream and update Wikidata database."""

import json
import re
import sys
import requests
import time
from threading import Thread
from queue import Queue
import sling
import sling.flags as flags
from sling.crawl.sse import SSEStream

flags.define("--wiki_changes_stream",
             help="stream for monitoring updates to wikidata",
             default="https://stream.wikimedia.org/v2/stream/recentchange",
             metavar="URL")

flags.define("--since",
             help="retrieve event starting from a specific time",
             default=None,
             metavar="YYYY-MM-DDThh:mm:ssZ")

flags.define("--wiki_fetch_url",
             help="url for fetching items from wikidata",
             default="https://www.wikidata.org/wiki/Special:EntityData",
             metavar="URL")

flags.define("--dburl",
             help="wiki database url for collecting changes",
             default="http://localhost:7070/wikidata",
Ejemplo n.º 26
0
import praw
import json
import traceback
import sys
import time
import sling.crawl.news as news
import sling.flags as flags

flags.define("--apikeys",
             default="local/keys/reddit.json",
             help="Reddit API key file")

flags.parse()

# Treat every submission to these subreddits as a news article.
news_reddits = [
    "AutoNewspaper", "nofeenews", "newsdk", "news",
    "Full_news", "qualitynews", "worldnews", "worldevents",
]

# Subreddits that are ignored.
ignored_reddits = ["u_toronto_news", "newsokur"]
Ejemplo n.º 27
0
Fetch the Companies House company registry using the streaming API.
"""

import json
import requests
import sling
import sling.flags as flags
import sling.crawl.chs as chs
import sys
import time
import traceback
from threading import Thread
from queue import Queue

flags.define("--chskeys",
             help="Companies House API key file",
             default="local/keys/chs.txt",
             metavar="FILE")

flags.define("--chsdb",
             help="database for storing Companies House records",
             default="http://localhost:7070/chs",
             metavar="DBURL")

flags.define("--checkpoint",
             help="File with latest checkpoint",
             default=None,
             metavar="FILE")

flags.define("--checkpoint_interval",
             help="How often checkpoint is written to disk (seconds)",
             default=60,
Ejemplo n.º 28
0
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Workflow for silver-labeling of Wikipedia articles"""

import os
import sling.flags as flags
import sling.task.corpora as corpora
import sling.task.data as data
from sling.task import *

flags.define("--silver_corpus_size",
             help="maximum number of documents in silver corpus",
             default=None,
             type=int,
             metavar="NUM")

flags.define("--decoder",
             help="parser decoder type",
             default="knolex")

flags.define("--simple_types",
             help="use simple commons store with basic types",
             default=False,
             action="store_true")

flags.define("--subwords",
             help="use subword tokenization",
             default=False,
Ejemplo n.º 29
0
# See the License for the specific language governing permissions and
# limitations under the License.
"""Workflows for downloading wiki dumps and datasets"""

import os
from urllib.request import urlopen
import time

import sling
import sling.task.corpora as corpora
import sling.flags as flags
import sling.log as log
from sling.task.workflow import *

flags.define("--dataurl",
             help="data set site",
             default="https://ringgaard.com/data",
             metavar="URL")

flags.define("--dataset",
             help="list of datasets to fetch",
             default="",
             metavar="LIST")

# Number of concurrent downloads.
download_concurrency = 0


# Task for downloading files.
class UrlDownload:
    def run(self, task):
        # Get task parameters.
Ejemplo n.º 30
0
import datetime
import requests
import sys
import collections
import xml.etree.ElementTree as ET
import sling.flags as flags
import sling.crawl.dnscache
import sling.crawl.news as news

flags.define("--daily",
             default=False,
             action="store_true",
             help="fetch daily news feed from newslookup.com")

flags.define("--hourly",
             default=False,
             action="store_true",
             help="fetch hourly news feed from newslookup.com")

flags.define("--newsites",
             default=False,
             action="store_true",
             help="output new unknown news sites")

flags.define("--file",
             default=None,
             help="fetch news articles from newslookup file")

flags.define("--backupdir",
             default=None,
             help="backup directory for newslookup files")