Ejemplo n.º 1
0
 def __init__(self, srclang, targetlang, service, model):
     """Store the service definition and model, and build the content processor.

     Args:
         srclang: Forwarded as the first positional argument to
             ContentProcessor.
         targetlang: Forwarded as the second positional argument to
             ContentProcessor.
         service: Service definition exposing ``get``; its 'sourcebpe',
             'targetbpe', 'sourcespm' and 'targetspm' entries are looked
             up and passed to ContentProcessor as keyword arguments.
         model: Object kept on the instance as ``self.worker``.
     """
     self.service = service
     lookup = self.service.get
     # Each option is fetched without a default, so whatever ``get``
     # returns for an absent entry is handed straight to ContentProcessor.
     processor_options = {
         option: lookup(option)
         for option in ('sourcebpe', 'targetbpe', 'sourcespm', 'targetspm')
     }
     self.contentprocessor = ContentProcessor(
         srclang, targetlang, **processor_options)
     # becomes nonempty if there are multiple target languages
     self.preamble = ""
     self.worker = model
Ejemplo n.º 2
0
 def __init__(self, srclang, targetlang, service):
     """Set up the queue, content processor and websocket URL for a service.

     Args:
         srclang: Forwarded as the first positional argument to
             ContentProcessor.
         targetlang: Forwarded as the second positional argument to
             ContentProcessor.
         service: Service definition. It is subscripted for 'host',
             'port' and 'configuration', and queried with ``get`` for
             'sourcebpe', 'targetbpe', 'sourcespm' and 'targetspm'.

     ``self.run()`` is invoked at the end of construction only when the
     'configuration' entry is truthy.
     """
     self.q = queues.Queue()
     # Service definition
     self.service = service
     self.p = None

     svc = self.service
     segmentation = {
         name: svc.get(name)
         for name in ('sourcebpe', 'targetbpe', 'sourcespm', 'targetspm')
     }
     self.contentprocessor = ContentProcessor(
         srclang, targetlang, **segmentation)

     # Host is read before port, matching the placeholder order.
     endpoint = (svc['host'], svc['port'])
     self.ws_url = "ws://{}:{}/translate".format(*endpoint)

     # Nothing to start when the configuration entry is falsy.
     if not svc['configuration']:
         return
     self.run()
Ejemplo n.º 3
0
        x = x.strip()
        y = y.strip()
        z = z.strip()
        pairs.append((x, y, z))
# Filter out the sentences with fewer than 5 tokens or more than 120 tokens.
# The length is measured on the first element of each pair, split on
# whitespace.  A single-pass rebuild replaces the reverse pop() loop (which
# shifted the list tail on every removal and split each sentence twice);
# slice assignment keeps the same list object, so any other reference to
# `pairs` still observes the filtered content.
pairs[:] = [pair for pair in pairs if 5 <= len(pair[0].split()) <= 120]
# Load preprocessor: read the service table from service.json, pick the
# entry for this src/trg pair and hand its BPE/SPM settings to
# ContentProcessor as keyword arguments.
with open("service.json", 'r') as configfile:
    services = json.load(configfile)
config = services[src][trg]
_segmentation = {
    key: config.get(key)
    for key in ('sourcebpe', 'targetbpe', 'sourcespm', 'targetspm')
}
contentprocessor = ContentProcessor(src, trg, **_segmentation)

# Append to the per-language CSV: one header row, then one row per pair
# with the third tuple element moved to the first column.
with open("{}_en_pairs.csv".format(src), "a", newline='') as datacsv:
    csvwriter = csv.writer(datacsv, dialect="excel")
    csvwriter.writerow(["score", src, "en"])
    csvwriter.writerows(
        [third, first, second] for first, second, third in pairs)

# Run the preprocessor over the first element of every pair, then write one
# output line per pair: every string returned by preprocess() followed by a
# single space (including the last one), terminated by a newline.
sentences = [contentprocessor.preprocess(pair[0]) for pair in pairs]
with open('input_{}.txt'.format(src), 'w') as f:
    for pieces in sentences:
        f.writelines(piece + ' ' for piece in pieces)
        f.write('\n')
Ejemplo n.º 4
0
from query import CrawlerDb
from content_processor import ContentProcessor
from settings import LOGGING
import sys, urlparse, urllib2, shutil, glob, robotparser
import logging, logging.config
import traceback

# ===== Init stuff =====

# db init: create the crawler database handle and connect it
cdb = CrawlerDb()
cdb.connect()

# content processor init (all three constructor arguments are left as None)
processor = ContentProcessor(None, None, None)

# logging setup: apply the LOGGING dict imported from settings, then fetch
# the logger used throughout this script
logging.config.dictConfig(LOGGING)
logger = logging.getLogger("crawler_logger")

# robot parser init
robot = robotparser.RobotFileParser()

# At least one start url must be given on the command line.  Its absence is
# a usage error, so it is logged at ERROR level and the process exits with a
# non-zero status; a bare sys.exit() would report success to the caller.
if len(sys.argv) < 2:
    logger.error("Error: No start url was passed")
    sys.exit(1)

# Every argument after the script name is passed to cdb.enqueue as one list.
l = sys.argv[1:]

cdb.enqueue(l)
Ejemplo n.º 5
0
 def __init__(self, *args, **kwargs):
     """Initialise the spider and attach a content processor.

     All positional and keyword arguments are forwarded unchanged to the
     parent class initialiser.
     """
     super(GeneralSpider, self).__init__(*args, **kwargs)
     # A ContentProcessor built with no arguments is created per instance.
     self.content_processor = ContentProcessor()