Beispiel #1
0
class TranslatorInterface():
    """An interface to a single, possibly multilingual, model.

    Wraps a worker object (``model``) and a ContentProcessor: text is
    preprocessed into sentences, handed to the worker as one
    newline-joined string, and the worker's output is postprocessed.
    """

    def __init__(self, srclang, targetlang, service, model):
        """Store the service definition and worker, and build the processor.

        Args:
            srclang: source language, forwarded to ContentProcessor.
            targetlang: target language, forwarded to ContentProcessor.
            service: mapping read with ``.get`` for the optional
                'sourcebpe', 'targetbpe', 'sourcespm' and 'targetspm' entries.
            model: worker object used by translate(), ready() and on_exit();
                may be None.
        """
        self.service = service
        self.contentprocessor = ContentProcessor(
            srclang,
            targetlang,
            sourcebpe=self.service.get('sourcebpe'),
            targetbpe=self.service.get('targetbpe'),
            sourcespm=self.service.get('sourcespm'),
            targetspm=self.service.get('targetspm'))
        self.worker = model
        # becomes nonempty if there are multiple target languages
        self.preamble = ""

    def translate(self, text):
        """Translate ``text`` and return the result as one space-joined string.

        The preamble is prepended once to the newline-joined sentences
        before they are passed to the worker.
        """
        sentences = self.contentprocessor.preprocess(text)
        translatedSentences = self.worker.translate(self.preamble +
                                                    '\n'.join(sentences))
        translation = self.contentprocessor.postprocess(translatedSentences)
        return ' '.join(translation)

    def ready(self):
        """Return False when there is no worker, else the worker's ready()."""
        # Identity test: `!= None` would dispatch to a custom __ne__ on the
        # worker and could misreport a real worker as missing.
        return self.worker is not None and self.worker.ready()

    def on_exit(self):
        """Forward the exit notification to the worker, if there is one."""
        if self.worker is not None:
            self.worker.on_exit()
Beispiel #2
0
class TranslatorWorker():
    """Starts a marian-server subprocess and translates through its websocket.

    The websocket URL is built from the service's 'host' and 'port'; the
    server is only launched when the service has a truthy 'configuration'.
    """

    def __init__(self, srclang, targetlang, service):
        """Build the content processor and websocket URL, then maybe start the server.

        Args:
            srclang: source language, forwarded to ContentProcessor.
            targetlang: target language, forwarded to ContentProcessor.
            service: mapping with required 'host', 'port' and 'configuration'
                keys and optional 'sourcebpe', 'targetbpe', 'sourcespm',
                'targetspm' entries.
        """
        self.q = queues.Queue()
        # Service definition
        self.service = service
        self.p = None
        self.contentprocessor = ContentProcessor(
            srclang,
            targetlang,
            sourcebpe=self.service.get('sourcebpe'),
            targetbpe=self.service.get('targetbpe'),
            sourcespm=self.service.get('sourcespm'),
            targetspm=self.service.get('targetspm'))
        self.ws_url = "ws://{}:{}/translate".format(self.service['host'],
                                                    self.service['port'])
        if self.service['configuration']:
            self.run()

    @gen.coroutine
    def run(self):
        """Launch marian-server and wait (as a coroutine) for it to exit."""
        process.Subprocess.initialize()
        self.p = process.Subprocess([
            'marian-server',
            '-c',
            self.service['configuration'],
            '-p',
            self.service['port'],
            '--allow-unk',
            # enables translation with a mini-batch size of 64, i.e. translating 64 sentences at once, with a beam-size of 6.
            '-b',
            '6',
            '--mini-batch',
            '64',
            # use a length-normalization weight of 0.6 (this usually increases BLEU a bit).
            '--normalize',
            '0.6',
            '--maxi-batch-sort',
            'src',
            '--maxi-batch',
            '100',
        ])
        self.p.set_exit_callback(self.on_exit)
        yield self.p.wait_for_exit()

    def on_exit(self, returncode=None):
        """Report that the server process exited.

        ``returncode`` is optional so the method works both as a Tornado
        exit callback (which is called with the process return code) and
        when invoked directly without arguments.
        """
        print("Process exited")

    def translate(self, srctxt):
        """Translate ``srctxt`` in one websocket round trip.

        All preprocessed sentences are sent as a single newline-joined
        message; the reply is split on newlines, postprocessed and joined
        with spaces.
        """
        # Preprocess before connecting so a preprocessing error never
        # leaves an open socket behind.
        sentences = self.contentprocessor.preprocess(srctxt)
        ws = websocket.create_connection(self.ws_url)
        try:
            ws.send('\n'.join(sentences))
            translatedSentences = ws.recv().split('\n')
        finally:
            # Always release the connection, even if send/recv raised.
            ws.close()
        translation = self.contentprocessor.postprocess(translatedSentences)
        return ' '.join(translation)
Beispiel #3
0
 def __init__(self, srclang, targetlang, service, model):
     """Keep the service definition and model, and build the content processor.

     The 'sourcebpe', 'targetbpe', 'sourcespm' and 'targetspm' entries are
     read from ``service`` with ``.get`` and passed to ContentProcessor as
     keyword arguments of the same names.
     """
     self.service = service
     processor_options = {
         option: service.get(option)
         for option in ('sourcebpe', 'targetbpe', 'sourcespm', 'targetspm')
     }
     self.contentprocessor = ContentProcessor(
         srclang, targetlang, **processor_options)
     self.worker = model
     # Starts empty; becomes nonempty if there are multiple target languages.
     self.preamble = ""
Beispiel #4
0
 def __init__(self, srclang, targetlang, service):
     """Set up the queue, content processor and websocket URL for a service.

     ``service`` must provide 'host', 'port' and 'configuration'; the
     'sourcebpe', 'targetbpe', 'sourcespm' and 'targetspm' entries are
     optional and read with ``.get``. When 'configuration' is truthy,
     ``self.run()`` is called at the end.
     """
     self.q = queues.Queue()
     # Service definition
     self.service = service
     self.p = None
     processor_options = {
         option: service.get(option)
         for option in ('sourcebpe', 'targetbpe', 'sourcespm', 'targetspm')
     }
     self.contentprocessor = ContentProcessor(
         srclang, targetlang, **processor_options)
     host = service['host']
     port = service['port']
     self.ws_url = "ws://{}:{}/translate".format(host, port)
     if service['configuration']:
         self.run()
class GeneralSpider(Spider):
    """Spider named 'general' that requests every link found in HTML responses.

    Its idle handler raises DontCloseSpider each time the idle signal fires.
    """

    name = 'general'

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and create the ContentProcessor used by parse()."""
        super(GeneralSpider, self).__init__(*args, **kwargs)
        self.content_processor = ContentProcessor()

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider, attach ``crawler`` and subscribe to the idle signal."""
        instance = cls(*args, **kwargs)
        instance._set_crawler(crawler)
        signal_manager = instance.crawler.signals
        signal_manager.connect(instance.spider_idle,
                               signal=signals.spider_idle)
        return instance

    def spider_idle(self):
        """Log that the idle signal arrived, then raise DontCloseSpider."""
        self.log("Spider idle signal caught.")
        raise DontCloseSpider

    def parse(self, response):
        """Yield one Request per extracted link; yield nothing for non-HTML responses.

        Each request carries the link's text in ``meta['link_text']``.
        """
        if isinstance(response, HtmlResponse):
            processed = self.content_processor.process_response(response)
            for link in processed.links:
                request = Request(url=link.url)
                request.meta.update(link_text=link.text)
                yield request
Beispiel #6
0
class TranslatorWorker():
    """Starts a marian-server subprocess and translates through its websocket.

    Sentences are translated one websocket message at a time. The server
    is only launched when the service has a truthy 'configuration'.
    """

    def __init__(self, srclang, targetlang, service):
        """Build the content processor and websocket URL, then maybe start the server.

        Args:
            srclang: source language, forwarded to ContentProcessor.
            targetlang: target language, forwarded to ContentProcessor.
            service: mapping with required 'host', 'port' and 'configuration'
                keys and optional 'sourcebpe', 'targetbpe', 'sourcespm',
                'targetspm' entries.
        """
        self.q = queues.Queue()
        # Service definition
        self.service = service
        self.p = None
        self.contentprocessor = ContentProcessor(
            srclang,
            targetlang,
            sourcebpe=self.service.get('sourcebpe'),
            targetbpe=self.service.get('targetbpe'),
            sourcespm=self.service.get('sourcespm'),
            targetspm=self.service.get('targetspm')
        )
        self.ws_url = "ws://{}:{}/translate".format(
            self.service['host'], self.service['port'])
        if self.service['configuration']:
            self.run()

    @gen.coroutine
    def run(self):
        """Launch marian-server and wait (as a coroutine) for it to exit."""
        process.Subprocess.initialize()
        self.p = process.Subprocess(['marian-server', '-c',
                                     self.service['configuration'],
                                     '--quiet-translation',
                                     '-p', self.service['port']])
        self.p.set_exit_callback(self.on_exit)
        yield self.p.wait_for_exit()

    def on_exit(self, returncode=None):
        """Report that the server process exited.

        ``returncode`` is optional so the method works both as a Tornado
        exit callback (which is called with the process return code) and
        when invoked directly without arguments.
        """
        print("Process exited")

    def translate(self, srctxt):
        """Translate ``srctxt`` sentence by sentence over one websocket connection.

        Each preprocessed sentence is sent as its own message and the reply
        collected; the replies are postprocessed and joined with spaces.
        """
        # Preprocess before connecting so a preprocessing error never
        # leaves an open socket behind.
        sentences = self.contentprocessor.preprocess(srctxt)
        translatedSentences = []
        ws = websocket.create_connection(self.ws_url)
        try:
            for sentence in sentences:
                ws.send(sentence)
                translatedSentences.append(ws.recv())
        finally:
            # Always release the connection, even if send/recv raised.
            ws.close()
        translation = self.contentprocessor.postprocess(translatedSentences)
        return ' '.join(translation)
Beispiel #7
0
        x = x.strip()
        y = y.strip()
        z = z.strip()
        pairs.append((x, y, z))
# Filter out the sentences with fewer than 5 tokens or more than 120 tokens.
# Slice assignment filters in a single pass while keeping the same list
# object, instead of popping elements one by one (quadratic) and splitting
# every sentence twice.
pairs[:] = [pair for pair in pairs if 5 <= len(pair[0].split()) <= 120]
# Load preprocessor settings for the src -> trg direction from service.json
with open("service.json", 'r') as configfile:
    services = json.load(configfile)
config = services[src][trg]
contentprocessor = ContentProcessor(src,
                                    trg,
                                    sourcebpe=config.get('sourcebpe'),
                                    targetbpe=config.get('targetbpe'),
                                    sourcespm=config.get('sourcespm'),
                                    targetspm=config.get('targetspm'))

# Write the pairs as CSV rows of (score, source sentence, target sentence).
# NOTE(review): the file is opened in append mode, so the header row is
# written again on every run -- confirm this is intended.
with open("{}_en_pairs.csv".format(src), "a", newline='') as datacsv:
    csvwriter = csv.writer(datacsv, dialect=("excel"))
    csvwriter.writerow(["score", src, "en"])
    for s, t, score in pairs:
        csvwriter.writerow([score, s, t])

# Preprocess every source sentence and write one line per pair; each
# preprocessed string is followed by a single space.
sentences = [contentprocessor.preprocess(pair[0]) for pair in pairs]
with open('input_{}.txt'.format(src), 'w') as f:
    for _list in sentences:
        for _string in _list:
            f.write(_string + ' ')
        f.write('\n')
Beispiel #8
0
from query import CrawlerDb
from content_processor import ContentProcessor
from settings import LOGGING
import sys, urlparse, urllib2, shutil, glob, robotparser
import logging, logging.config
import traceback

# ===== Init stuff =====

# db init
cdb = CrawlerDb()
cdb.connect()

# content processor init
processor = ContentProcessor(None, None, None)

# logging setup
logging.config.dictConfig(LOGGING)
logger = logging.getLogger("crawler_logger")

# robot parser init
robot = robotparser.RobotFileParser()

# At least one start URL is required on the command line. This is a failure,
# so it is logged at error level and the process exits with a non-zero
# status that calling scripts can detect.
if len(sys.argv) < 2:
    logger.error("Error: No start url was passed")
    sys.exit(1)

# Every command-line argument is treated as a start URL and queued.
l = sys.argv[1:]

cdb.enqueue(l)
Beispiel #9
0
from query import CrawlerDb
from content_processor import ContentProcessor
from settings import LOGGING
import sys, urlparse, urllib2, shutil, glob, robotparser
import logging, logging.config
import traceback

# ===== Init stuff =====

# db init
cdb = CrawlerDb()
cdb.connect()

# content processor init
processor = ContentProcessor(None, None, None)

# logging setup
logging.config.dictConfig(LOGGING)
logger = logging.getLogger("crawler_logger")

# robot parser init
robot = robotparser.RobotFileParser()

# At least one start URL is required on the command line. This is a failure,
# so it is logged at error level and the process exits with a non-zero
# status that calling scripts can detect.
if len(sys.argv) < 2:
    logger.error("Error: No start url was passed")
    sys.exit(1)

# Every command-line argument is treated as a start URL and queued.
l = sys.argv[1:]

cdb.enqueue(l)
 def __init__(self, *args, **kwargs):
     """Forward all arguments to the parent initialiser and create a ContentProcessor."""
     # Explicit two-argument super() form; the class header is outside this
     # fragment, so the call is left exactly as written.
     super(GeneralSpider, self).__init__(*args, **kwargs)
     # Constructed with no arguments and stored on self.content_processor.
     self.content_processor = ContentProcessor()