コード例 #1
0
ファイル: main.py プロジェクト: vidiecan/importer
 def _get_exported( fncs_list ):
     """ Return names from fncs_list that start with the configured
         export prefix, with the prefix stripped.

         :param fncs_list: iterable of function names (any str-like type)
         :return: list of unicode names without the prefix
     """
     prefix = env["datasets"]["export_prefix"]
     exported = []
     for m in fncs_list:
         # convert once per item instead of twice
         name = utils.uni(m)
         if name.startswith(prefix):
             exported.append(name[len(prefix):])
     return exported
コード例 #2
0
ファイル: main.py プロジェクト: vidiecan/importer
 def _get_exported(fncs_list):
     """ Return names from fncs_list that start with the configured
         export prefix, with the prefix stripped.

         :param fncs_list: iterable of function names (any str-like type)
         :return: list of unicode names without the prefix
     """
     prefix = env["datasets"]["export_prefix"]
     exported = []
     for m in fncs_list:
         # convert once per item instead of twice
         name = utils.uni(m)
         if name.startswith(prefix):
             exported.append(name[len(prefix):])
     return exported
コード例 #3
0
ファイル: index.py プロジェクト: alokedesai/Word-Connections
def addWord():
    """ Web endpoint: place the submitted word into one of the submitted
        categories (or open a new one) and return the categories as JSON.

        Reads "categories" and "word" from the request query string.
    """
    global matrix
    inputs = list(json.loads(uni(request.args.get("categories"))))
    # an empty category list marks a fresh session - reset the matrix too
    if not inputs:
        matrix = []
    w = uni(request.args.get("word"))
    i = compareToCategory(w, inputs, 20, matrix)
    if i == -1:
        # no category matched: start a new one with just this word
        inputs.append([w])
    else:
        inputs[i].append(w)
    return json.dumps(inputs)
コード例 #4
0
ファイル: __init__.py プロジェクト: vidiecan/importer
 def query( self, qfield, qvalue, fields=None, pages_count=10, qf=None ):
     """ Query the backend for qfield:qvalue and return the search result.

         :param qfield: field name to search in
         :param qvalue: value to search for (escaped before use)
         :param fields: optional list of result fields (backend "fl")
         :param pages_count: maximum number of rows to return
         :param qf: optional query-fields parameter for the backend
         :return: backend search result, or None on failure
     """
     try:
         kwargs = {}
         if fields is not None:
             kwargs["fl"] = ",".join(fields)
         qvalue = _backend._escape_for_string(qvalue)
         kwargs["q"] = u"%s:%s" % (qfield, qvalue)
         if qf is not None:
             kwargs["qf"] = qf
         kwargs["rows"] = pages_count
         return self._backend.search(**kwargs)
     except Exception as e:
         # log through both channels so suspicious failures are noticed
         _logger.exception(u"Could not query backend [%s].", utils.uni(e))
         _logger_suspicious.exception(
             u"Could not query backend [%s].", utils.uni(e))
コード例 #5
0
 def load(self):
     """ Load the MovieLens ratings, split them into a training matrix
         and per-user held-out history, and compute a truncated SVD of
         the training matrix. """
     # coordinates (user, movie) of the ratings routed to training
     cox = []
     coy = []
     # held-out interactions per user (click-through history)
     self.ctrh = {}
     # movie -> number of ratings seen (popularity counter)
     movies = {}
     for user, movie, rate, timestamp in movielens_data():
         # ids in the data are 1-based; shift to 0-based indices
         user = int(user) - 1
         movie = int(movie) - 1
         if movie in movies:
             movies[movie] += 1
         else:
             movies[movie] = 1
         # route each rating randomly: training vs held-out evaluation
         if np.random.uniform(0, 1) < self.train_portion:
             cox.append(user)
             coy.append(movie)
         else:
             if user in self.ctrh:
                 self.ctrh[user].append(movie)
             else:
                 self.ctrh[user] = [movie]
     # binary user x movie interaction matrix over the training split;
     # shape covers every user/movie id seen in either split
     self.A = sparse.coo_matrix((np.ones(len(cox)), (cox, coy)),
                                shape=(max(list(self.ctrh) + cox) + 1,
                                       max(list(movies) + coy) + 1),
                                dtype=np.float32)
     # rank-d truncated SVD of the training matrix
     self.U, self.S, self.VT = svds(self.A, self.d)
     self.A = self.A.astype(np.int32)
     # uni() is defined elsewhere - presumably row normalisation; TODO confirm
     for i in range(self.U.shape[0]):
         self.U[i] = uni(self.U[i])
     self.V = self.VT.T
     for i in range(self.V.shape[0]):
         self.V[i] = uni(self.V[i])
     # number of arms (movies) to keep; default to all movies
     if self.n_movies is None:
         self.L = len(movies)
     else:
         self.L = self.n_movies
     # the L most popular movies form the arm set
     self.arms = set([
         x[1] for x in sorted([(movies[movie], movie)
                               for movie in movies])[-self.L:]
     ])
     # keep users whose arm overlap lies strictly inside the baseline band
     self.users = [
         x[1] for x in sorted([(overlap(self.ctrh[user], self.arms), user)
                               for user in self.ctrh
                               if self.L * self.baseline[0] < overlap(
                                   self.ctrh[user], self.arms) < self.L *
                               self.baseline[1]])
     ]
     logging.info('total {0} users involved'.format(len(self.users)))
     self.d = self.d**2
コード例 #6
0
ファイル: models.py プロジェクト: izderadicka/myplaces
 def __unicode__(self):
     """ Build a readable address from the non-empty parts, falling
         back to the unformatted address string. """
     parts = []

     def _push(value):
         # keep only truthy parts, converted to unicode
         if value:
             parts.append(uni(value))

     _push(self.street)
     if self.postal_code:
         _push(u'%s %s' % (uni(self.postal_code), uni(self.city)))
     else:
         _push(self.city)
     _push(self.county)
     _push(self.state)
     _push(self.country)

     if parts:
         return u', '.join(parts)
     return self.unformatted or ''
コード例 #7
0
ファイル: __init__.py プロジェクト: vidiecan/importer
 def query(self, qfield, qvalue, fields=None, pages_count=10, qf=None):
     """ Query the backend for qfield:qvalue and return the search result.

         :param qfield: field name to search in
         :param qvalue: value to search for (escaped before use)
         :param fields: optional list of result fields (backend "fl")
         :param pages_count: maximum number of rows to return
         :param qf: optional query-fields parameter for the backend
         :return: backend search result, or None on failure
     """
     try:
         kwargs = {}
         if fields is not None:
             kwargs["fl"] = ",".join(fields)
         qvalue = _backend._escape_for_string(qvalue)
         kwargs["q"] = u"%s:%s" % (qfield, qvalue)
         if qf is not None:
             kwargs["qf"] = qf
         kwargs["rows"] = pages_count
         return self._backend.search(**kwargs)
     except Exception as e:
         # log through both channels so suspicious failures are noticed
         _logger.exception(u"Could not query backend [%s].", utils.uni(e))
         _logger_suspicious.exception(
             u"Could not query backend [%s].", utils.uni(e))
コード例 #8
0
ファイル: __init__.py プロジェクト: vidiecan/importer
 def delete( self, docs_array ):
     """ Delete the given documents from the backend.

         :param docs_array: documents (or ids) to delete
         :return: True on success, None on failure
     """
     try:
         self._backend.delete(docs_array)
         return True
     except Exception as e:
         _logger.warning(u"Could not delete from backend [%s].",
                         utils.uni(e))
コード例 #9
0
ファイル: __init__.py プロジェクト: vidiecan/importer
 def optimise( self ):
     """ Optimise the backend index.

         :return: True on success, None on failure
     """
     _logger.info("Optimising index.")
     try:
         self._backend.optimize()
         _logger.info("Optimised.")
         return True
     except Exception as e:
         _logger.warning(u"Could not optimise backend [%s].", utils.uni(e))
コード例 #10
0
 def search( self, query_dict, fields=None, pages_count=10 ):
     """ Run a raw query dict against the backend and return the result.

         :param query_dict: keyword arguments forwarded to backend.search
         :param fields: accepted for interface compatibility - unused here
         :param pages_count: accepted for interface compatibility - unused
         :return: backend response, or None on failure
     """
     try:
         with adapter.solr_backend(self) as backend:
             resp = backend.search(**query_dict)
             return resp
     except Exception as e:
         _logger.exception(u"Could not query backend [%s].", utils.uni(e))
コード例 #11
0
ファイル: __init__.py プロジェクト: vidiecan/importer
 def optimise(self):
     """ Optimise the backend index.

         :return: True on success, None on failure
     """
     _logger.info("Optimising index.")
     try:
         self._backend.optimize()
         _logger.info("Optimised.")
         return True
     except Exception as e:
         _logger.warning(u"Could not optimise backend [%s].", utils.uni(e))
コード例 #12
0
ファイル: __init__.py プロジェクト: vidiecan/importer
 def delete(self, docs_array):
     """ Delete the given documents from the backend.

         :param docs_array: documents (or ids) to delete
         :return: True on success, None on failure
     """
     try:
         self._backend.delete(docs_array)
         return True
     except Exception as e:
         _logger.warning(u"Could not delete from backend [%s].",
                         utils.uni(e))
コード例 #13
0
 def optimise(self, maxSegments=None):
     """ Optimise the backend index.

         :param maxSegments: optional segment target passed to the backend
         :return: True on success, None on failure
     """
     _logger.info("Optimising index.")
     try:
         with adapter.solr_backend(self) as backend:
             backend.optimize(waitSearcher=None, maxSegments=maxSegments)
             _logger.info("Optimised.")
             return True
     except Exception as e:
         _logger.warning(u"Could not optimise backend [%s].", utils.uni(e))
コード例 #14
0
 def optimise( self, maxSegments=None ):
     """ Optimise the backend index.

         :param maxSegments: optional segment target passed to the backend
         :return: True on success, None on failure
     """
     _logger.info("Optimising index.")
     try:
         with adapter.solr_backend(self) as backend:
             backend.optimize(waitSearcher=None, maxSegments=maxSegments)
             _logger.info("Optimised.")
             return True
     except Exception as e:
         _logger.warning(u"Could not optimise backend [%s].", utils.uni(e))
コード例 #15
0
 def query_generic( self, fields=None, pages_count=10, **kwargs ):
     """ Run a generic query built from kwargs and return the result.

         :param fields: optional list of fields to limit the result to
         :param pages_count: maximum number of rows to return
         :param kwargs: query parameters forwarded to backend.query
         :return: executed query result, or None on failure
     """
     try:
         with adapter.solr_backend(self) as backend:
             query = backend.query(**kwargs).paginate(rows=pages_count)
             if fields:
                 query = query.field_limit(fields)
             return query.execute()
     except Exception as e:
         _logger.warning(u"Could not query backend [%s].", utils.uni(e))
コード例 #16
0
 def query_generic(self, fields=None, pages_count=10, **kwargs):
     """ Run a generic query built from kwargs and return the result.

         :param fields: optional list of fields to limit the result to
         :param pages_count: maximum number of rows to return
         :param kwargs: query parameters forwarded to backend.query
         :return: executed query result, or None on failure
     """
     try:
         with adapter.solr_backend(self) as backend:
             query = backend.query(**kwargs).paginate(rows=pages_count)
             if fields:
                 query = query.field_limit(fields)
             return query.execute()
     except Exception as e:
         _logger.warning(u"Could not query backend [%s].", utils.uni(e))
コード例 #17
0
ファイル: __init__.py プロジェクト: vidiecan/importer
 def commit( self ):
     """ Commit pending changes to the index.

         :return: True on success, None on failure
     """
     # be sure we can write
     _logger.info(u"Trying to commit to index.")
     try:
         _logger.info("Committing to index.")
         self._backend.commit()
         _logger.info("Committed.")
         return True
     except Exception as e:
         _logger.warning(u"Could not commit in backend [%s].", utils.uni(e))
コード例 #18
0
ファイル: __init__.py プロジェクト: vidiecan/importer
 def commit(self):
     """ Commit pending changes to the index.

         :return: True on success, None on failure
     """
     # be sure we can write
     _logger.info(u"Trying to commit to index.")
     try:
         _logger.info("Committing to index.")
         self._backend.commit()
         _logger.info("Committed.")
         return True
     except Exception as e:
         _logger.warning(u"Could not commit in backend [%s].",
                         utils.uni(e))
コード例 #19
0
 def commit( self ):
     """ Commit pending changes to the index.

         :return: True on success, None on failure
     """
     try:
         with adapter.solr_backend(self) as backend:
             backend.commit()
             return True
     except Exception as e:
         _logger.warning(u"Could not commit in backend [%s].", utils.uni(e))
コード例 #20
0
 def delete( self, docs_array=None, queries=None ):
     """ Delete documents from the backend, either by queries or by docs.

         :param docs_array: documents to delete (used when queries is None)
         :param queries: optional delete-by-query list; takes precedence
         :return: True on success, None on failure
     """
     try:
         with adapter.solr_backend(self) as backend:
             if queries is not None:
                 backend.delete(queries=queries)
             else:
                 backend.delete(docs=docs_array)
             return True
     except Exception as e:
         _logger.warning(u"Could not delete from backend [%s].",
                         utils.uni(e))
コード例 #21
0
 def delete(self, docs_array=None, queries=None):
     """ Delete documents from the backend, either by queries or by docs.

         :param docs_array: documents to delete (used when queries is None)
         :param queries: optional delete-by-query list; takes precedence
         :return: True on success, None on failure
     """
     try:
         with adapter.solr_backend(self) as backend:
             if queries is not None:
                 backend.delete(queries=queries)
             else:
                 backend.delete(docs=docs_array)
             return True
     except Exception as e:
         _logger.warning(u"Could not delete from backend [%s].",
                         utils.uni(e))
コード例 #22
0
ファイル: adapter.py プロジェクト: vidiecan/importer
 def commit( self ):
     """ Commit pending changes to the index.

         Checks the not-writable time window first via
         adapter.check_writeable.

         :return: True on success, None on failure
     """
     # be sure we can write
     from_time, to_time = self.not_writable
     adapter.check_writeable(from_time, to_time)
     _logger.info("Trying to commit to index.")
     try:
         with adapter.solr_backend(self) as backend:
             _logger.info("Committing to index.")
             backend.commit()
             _logger.info("Committed.")
             return True
     except Exception as e:
         _logger.warning(u"Could not commit in backend [%s].", utils.uni(e))
コード例 #23
0
ファイル: __init__.py プロジェクト: vidiecan/importer
 def add(self, document_s, boosts=None):
     """ Add one document or a list of documents to the index.

         :param document_s: single document dict or a list of them
         :param boosts: optional boosts forwarded to the backend
         :return: True on success, False on failure
     """
     docs = utils.to_array(document_s)
     try:
         for document in docs:
             # derive the index id from the latex source
             document.update({"id": self.get_id(document["latex"])})
         self._backend.add(docs, boosts)
         if self.auto_commit:
             self.commit()
         return True
     except Exception as e:
         _logger.exception(u"Could not add document to index\n[%s].",
                           utils.uni(e))
         return False
コード例 #24
0
 def commit(self):
     """ Commit pending changes to the index.

         Checks the not-writable time window first via
         adapter.check_writeable.

         :return: True on success, None on failure
     """
     # be sure we can write
     from_time, to_time = self.not_writable
     adapter.check_writeable(from_time, to_time)
     _logger.info("Trying to commit to index.")
     try:
         with adapter.solr_backend(self) as backend:
             _logger.info("Committing to index.")
             backend.commit()
             _logger.info("Committed.")
             return True
     except Exception as e:
         _logger.warning(u"Could not commit in backend [%s].",
                         utils.uni(e))
コード例 #25
0
 def load(self):
     """ Load the MovieLens ratings, split them into a training matrix
         and per-user held-out history, and compute a truncated SVD of
         the training matrix. """
     # coordinates (user, movie) of the ratings routed to training
     cox = []
     coy = []
     # held-out interactions per user (click-through history)
     self.ctrh = {}
     # movie -> number of ratings seen (popularity counter)
     movies = {}
     for user, movie, rate, timestamp in movielens_data():
         # ids in the data are 1-based; shift to 0-based indices
         user = int(user) - 1
         movie = int(movie) - 1
         if movie in movies:
             movies[movie] += 1
         else:
             movies[movie] = 1
         # route each rating randomly: training vs held-out evaluation
         if np.random.uniform(0,1) < self.train_portion:
             cox.append(user)
             coy.append(movie)
         else:
             if user in self.ctrh:
                 self.ctrh[user].append(movie)
             else:
                 self.ctrh[user] = [movie]
     # binary user x movie interaction matrix over the training split;
     # shape covers every user/movie id seen in either split
     self.A = sparse.coo_matrix((np.ones(len(cox)), (cox, coy)), shape=(max(list(self.ctrh) + cox) + 1, max(list(movies) + coy) + 1), dtype=np.float32)
     # rank-d truncated SVD of the training matrix
     self.U, self.S, self.VT = svds(self.A, self.d)
     self.A = self.A.astype(np.int32)
     # uni() is defined elsewhere - presumably row normalisation; TODO confirm
     for i in range(self.U.shape[0]):
         self.U[i] = uni(self.U[i])
     self.V = self.VT.T
     for i in range(self.V.shape[0]):
         self.V[i] = uni(self.V[i])
     # number of arms (movies) to keep; default to all movies
     if self.n_movies is None:
         self.L = len(movies)
     else:
         self.L = self.n_movies
     # the L most popular movies form the arm set
     self.arms = set([x[1] for x in sorted([(movies[movie], movie) for movie in movies])[-self.L:]])
     # keep users whose arm overlap lies strictly inside the baseline band
     self.users = [x[1] for x in sorted([(overlap(self.ctrh[user], self.arms), user) for user in self.ctrh if self.L * self.baseline[0] < overlap(self.ctrh[user], self.arms) < self.L * self.baseline[1]])]
     logging.info('total {0} users involved'.format(len(self.users)))
     self.d = self.d ** 2
コード例 #26
0
ファイル: __init__.py プロジェクト: vidiecan/importer
    def _create_doc( self, latex, mathml, convert_js, docs=None, url=None, dataset=None, create_ego=False ):
        """ Assemble an index document dict from a converted formula.

            Copies the known conversion-status keys from convert_js and
            optionally attaches the egomath representation.
        """
        document = {
            "mathml": utils.uni(mathml),
            "latex": latex,
            "latex_len": len(latex),
            "documents": docs,
            "url": url,
            "dataset": utils.to_array(dataset),
        }
        # carry over conversion metadata when present
        for key in ( "result", "status", "status_code", "log" ):
            if key in convert_js:
                document[key] = convert_js[key]

        if create_ego:
            document["ego_math"] = ego_convert(latex, mathml[-1])
        return document
コード例 #27
0
ファイル: __init__.py プロジェクト: vidiecan/importer
 def add( self, document_s, boosts=None ):
     """ Add one document or a list of documents to the index.

         :param document_s: single document dict or a list of them
         :param boosts: optional boosts forwarded to the backend
         :return: True on success, False on failure
     """
     docs = utils.to_array(document_s)
     try:
         for document in docs:
             # derive the index id from the latex source
             document.update({
                 "id": self.get_id(document["latex"])
             })
         self._backend.add(docs, boosts)
         if self.auto_commit:
             self.commit()
         return True
     except Exception as e:
         _logger.exception(u"Could not add document to index\n[%s].",
                           utils.uni(e))
         return False
コード例 #28
0
 def add( self, document, **kwargs ):
     """ Add a single document to the index.

         :param document: document object (must expose id_str)
         :return: True on success, False after the error threshold fires,
                  None otherwise
     """
     try:
         with adapter.solr_backend(self) as backend:
             backend.add( [document], **kwargs )
         if self.auto_commit:
             self.commit()
         return True
     except Exception as e:
         import traceback
         traceback.print_exc(file=sys.__stdout__)
         self.errors += 1
         # NOTE(review): with a non-negative counter this condition is
         # always true right after the increment - verify the intended
         # error threshold
         if self.errors > 0:
             self.errors = 0
             _logger.exception(u"Could not add document to index [%s]\n[%s].",
                              document.id_str,
                              utils.uni(e))
             return False
コード例 #29
0
 def add(self, document, **kwargs):
     """ Add a single document to the index.

         :param document: document object exposing dict() and _values
         :return: True on success, False after the error threshold fires,
                  None otherwise
     """
     try:
         with adapter.solr_backend(self) as backend:
             backend.add([document.dict()], **kwargs)
         if self.auto_commit:
             self.commit()
         return True
     except Exception as e:
         import traceback
         traceback.print_exc(file=sys.__stdout__)
         self.errors += 1
         # NOTE(review): with a non-negative counter this condition is
         # always true right after the increment - verify the intended
         # error threshold
         if self.errors > 0:
             self.errors = 0
             _logger.exception(
                 u"Could not add document to index [%s]\n[%s].",
                 document._values.get(self.id_str,
                                      "no id in document values"),
                 utils.uni(e))
             return False
コード例 #30
0
ファイル: __init__.py プロジェクト: vidiecan/importer
    def _create_doc(self,
                    latex,
                    mathml,
                    convert_js,
                    docs=None,
                    url=None,
                    dataset=None,
                    create_ego=False):
        """ Assemble an index document dict from a converted formula.

            Copies the known conversion-status keys from convert_js and
            optionally attaches the egomath representation.
        """
        document = dict(mathml=utils.uni(mathml),
                        latex=latex,
                        latex_len=len(latex),
                        documents=docs,
                        url=url,
                        dataset=utils.to_array(dataset))
        # carry over conversion metadata when present
        for key in ("result", "status", "status_code", "log"):
            if key in convert_js:
                document[key] = convert_js[key]

        if create_ego:
            document["ego_math"] = ego_convert(latex, mathml[-1])
        return document
コード例 #31
0
ファイル: processing.py プロジェクト: vidiecan/importer
def process( env_dict, ftor_to_call, final_ftor ):
    """
    Index function wrapper around analyse_one_page and commit_to_index.

    Iterates over the files matched by env_dict["input"] and invokes
    ftor_to_call with (env_dict, position, file) for each one, either
    through a thread/process pool or sequentially; finally invokes
    final_ftor with env_dict, if supplied.

    :param env_dict: environment/configuration mapping
    :param ftor_to_call: callable receiving one (env, position, file) tuple
    :param final_ftor: optional callable invoked once at the end
    """

    # global exit signaller - shared (unlocked) int workers can set
    exit_ = multiprocessing.Array(ctypes.c_int, 1, lock=False)
    exit_[0] = 0

    logger.info(u"Reading input from [%s]", env_dict["input"])

    # files finished in a previous run (lower-cased absolute paths)
    done_set = set()
    if env_dict["indexer"].get( "continue", False ):
        fname = env_dict["indexer"]["continue"]
        if os.path.exists(fname):
            done_set = set( [ utils.uni(x).strip().lower()
                              for x in codecs.open( fname, encoding="utf-8",
                                                    mode="r", errors="ignore" ).readlines() ] )

    def iparameters():
        """ Yield (env_dict, position, file) for every file to process. """
        i = 0
        for file_ in glob.iglob(env_dict["input"]):
            file_ = os.path.abspath(file_)
            if file_.lower() in done_set:
                continue
            no_go = False
            file_basename = os.path.basename(file_)
            # skip files whose name starts with an excluded prefix
            for not_acceptable_start in env_dict["exclude"]["file_starts"]:
                if file_basename.startswith(not_acceptable_start):
                    logger.warning(u"Skipping this file (invalid title start) [%s]", file_basename)
                    no_go = True
            if no_go:
                continue
            i += 1
            yield ( env_dict, i, file_ )

    # create pool of slaves if specified
    #
    if env_dict["parallel"]["enabled"] and not env_dict.debug:

        # parallel version
        # - threaded
        # - processed
        #
        max_parallel = env_dict["parallel"]["max"]
        kwargs = {}
        if env_dict["parallel"]["threads"]:
            logger.info("Using threading pool with [%d] max concurrent threads.", max_parallel)
            Pool = ThreadPool
        else:
            Pool = multiprocessing.Pool
            kwargs = {}
            if "maxtasksperchild" in env_dict["parallel"]:
                kwargs["maxtasksperchild"] = env_dict["parallel"]["maxtasksperchild"]
            logger.info("Using process pool with [%d] max concurrent processes, chunk size [%s], [%s].",
                        max_parallel, env_dict["parallel"]["chunksize"], repr(kwargs))

        slaves = Pool(processes=max_parallel,
                      initializer=init_pool,
                      initargs=(exit_,),
                      **kwargs)

        # loop through all files; stop early when a worker signals exit
        it = slaves.imap(ftor_to_call, iparameters(), chunksize=env_dict["parallel"]["chunksize"])
        slaves.close()
        for _ in it:
            if exit_[0]:
                break
            utils.print_after.step()
        slaves.join()

    # not parallel version
    else:
        logger.info("Executing non parallel version [%s]", "debug=True" if env_dict.debug else "parallel.enabled=False")
        init_pool(exit_)
        for (env, pos, file_) in iparameters():
            ftor_to_call((env_dict, pos, file_))
            utils.print_after.step()
            # stop after env.count files - presumably a debug limit
            if pos >= env.count:
                break

    # final hook
    #
    if not final_ftor is None:
        final_ftor(env_dict)
コード例 #32
0
ファイル: _math.py プロジェクト: vidiecan/importer
        # vs <annotation id="p1.1.m1.1b" encoding="application/x-tex" xref="p1.1.m1.1.cmml">w</annotation>
        if _math_parser.re_tex_annotation.search(mathml_text) is None:
            # this can mean that we either do not have mathml
            for end_tag in ( "</math>", "</m:math>" ):
                if mathml_text.endswith(end_tag):
                    # we have mathml but no annotation
                    if end_tag == "</math>":
                        annotation = u"<annotation encoding=\"application/x-tex\">%s</annotation>" % latex_math
                    else:
                        annotation = u"<m:annotation encoding=\"application/x-tex\">%s</m:annotation>" % latex_math
                    mathml_text = mathml_text[:-len(end_tag)] + annotation + mathml_text[-len(end_tag):]
                    break



        mathml_text = utils.uni(mathml_text)

        # post processing
        # - converter problem
        #
        for to_remove in ( u"\end{document}", u"nowiki" ):
            if to_remove in mathml_text:
                logger.warn(u"Invalid math [%s]", mathml_text)
                # set to invalid
                mathml_pickled.delete_invalid( latex_math_db_id )
                if try_one_more_if_invalid:
                    return convert_wikimath_to_realmath( env_dict,
                                      wiki_math_match,
                                      mathml_pickled,
                                      url,
                                      doc,
コード例 #33
0
ファイル: processing.py プロジェクト: vidiecan/importer
def process(env_dict, ftor_to_call, final_ftor):
    """
    Index function wrapper around analyse_one_page and commit_to_index.

    Iterates over the files matched by env_dict["input"] and invokes
    ftor_to_call with (env_dict, position, file) for each one, either
    through a thread/process pool or sequentially; finally invokes
    final_ftor with env_dict, if supplied.

    :param env_dict: environment/configuration mapping
    :param ftor_to_call: callable receiving one (env, position, file) tuple
    :param final_ftor: optional callable invoked once at the end
    """

    # global exit signaller - shared (unlocked) int workers can set
    exit_ = multiprocessing.Array(ctypes.c_int, 1, lock=False)
    exit_[0] = 0

    logger.info(u"Reading input from [%s]", env_dict["input"])

    # files finished in a previous run (lower-cased absolute paths)
    done_set = set()
    if env_dict["indexer"].get("continue", False):
        fname = env_dict["indexer"]["continue"]
        if os.path.exists(fname):
            done_set = set([
                utils.uni(x).strip().lower() for x in
                codecs.open(fname, encoding="utf-8", mode="r",
                            errors="ignore").readlines()
            ])

    def iparameters():
        """ Yield (env_dict, position, file) for every file to process. """
        i = 0
        for file_ in glob.iglob(env_dict["input"]):
            file_ = os.path.abspath(file_)
            if file_.lower() in done_set:
                continue
            no_go = False
            file_basename = os.path.basename(file_)
            # skip files whose name starts with an excluded prefix
            for not_acceptable_start in env_dict["exclude"]["file_starts"]:
                if file_basename.startswith(not_acceptable_start):
                    logger.warning(
                        u"Skipping this file (invalid title start) [%s]",
                        file_basename)
                    no_go = True
            if no_go:
                continue
            i += 1
            yield (env_dict, i, file_)

    # create pool of slaves if specified
    #
    if env_dict["parallel"]["enabled"] and not env_dict.debug:

        # parallel version
        # - threaded
        # - processed
        #
        max_parallel = env_dict["parallel"]["max"]
        kwargs = {}
        if env_dict["parallel"]["threads"]:
            logger.info(
                "Using threading pool with [%d] max concurrent threads.",
                max_parallel)
            Pool = ThreadPool
        else:
            Pool = multiprocessing.Pool
            kwargs = {}
            if "maxtasksperchild" in env_dict["parallel"]:
                kwargs["maxtasksperchild"] = env_dict["parallel"][
                    "maxtasksperchild"]
            logger.info(
                "Using process pool with [%d] max concurrent processes, chunk size [%s], [%s].",
                max_parallel, env_dict["parallel"]["chunksize"], repr(kwargs))

        slaves = Pool(processes=max_parallel,
                      initializer=init_pool,
                      initargs=(exit_, ),
                      **kwargs)

        # loop through all files; stop early when a worker signals exit
        it = slaves.imap(ftor_to_call,
                         iparameters(),
                         chunksize=env_dict["parallel"]["chunksize"])
        slaves.close()
        for _ in it:
            if exit_[0]:
                break
            utils.print_after.step()
        slaves.join()

    # not parallel version
    else:
        logger.info(
            "Executing non parallel version [%s]",
            "debug=True" if env_dict.debug else "parallel.enabled=False")
        init_pool(exit_)
        for (env, pos, file_) in iparameters():
            ftor_to_call((env_dict, pos, file_))
            utils.print_after.step()
            # stop after env.count files - presumably a debug limit
            if pos >= env.count:
                break

    # final hook
    #
    if not final_ftor is None:
        final_ftor(env_dict)
コード例 #34
0
class mathml(object):
    """
        MathML object - wraps a MathML string and converts LaTeX to
        MathML through the configured latexml web service.
    """
    # conversion service endpoint and encoding (from project settings)
    url_form_latex = settings["converters"]["latexml"]["url"]
    encoding = settings["converters"]["encoding"]
    # attribute template used to tag math elements with an egomath id
    id_str = u' egomath="%s" '
    pattern_id_add = re.compile(
        u'(<math)\s(.*?xmlns="http://www.w3.org/1998/Math/MathML")')
    pattern_id_get = re.compile(id_str % u'(.*?)')

    def __init__(self, mathml_str):
        # raw MathML text
        self.str = mathml_str

    @staticmethod
    def from_latex(latex_math_orig):
        """
            Convert a LaTeX math string to MathML via the latexml service.

            :param latex_math_orig: LaTeX source of the formula
            :return: (mathml string or None, service json status or None)
        """
        # try fetching the answer
        js = None
        latex_math = latex_math_orig
        try:
            latex_math = latex(latex_math, full=False).str

            # is empty?
            if len(latex_math.strip()) == 0:
                _logger.warning(u"Warning: empty math - [%s]",
                                repr(latex_math))
                return None, None

            latex_math = u"$ %s $" % latex_math
            req = urllib2.Request(
                mathml.url_form_latex,
                urllib.urlencode({
                    'tex': latex_math.encode("utf-8"),
                    'profile': 'math',
                }))
            response = urllib2.urlopen(
                req, timeout=settings["converters"]["latexml"]["timeout"])

            # try parsing the answer
            import json

            js = json.load(response)
            result = js[settings["converters"]["latexml"]["result_field"]]
            message = js[settings["converters"]["latexml"]["status_field"]]
            if result:
                result = result.encode(mathml.encoding)
            if message:
                message = message.encode(mathml.encoding)

        except Exception as e:
            if js is None:
                # fake js so callers always receive a status structure
                js = {
                    "result": None,
                    "status": "Problem at early stage.",
                    "status_code": -1,
                    "log": repr(e),
                }
            _logger.error(u"Error: Connection problem - %s with [%s]", repr(e),
                          latex_math)
            return None, js

        everything_ok = False
        for msg in settings["converters"]["latexml"]["status_ok"]:
            if msg in message:
                everything_ok = message is not None and 0 < len(message)
                break
        not_empty_result = result and result != ''
        # everything ok - return answer
        if everything_ok and not_empty_result:
            return mathml(result).str, js

        # something fishy - try to correct it
        ascii_latex = utils.ascii(latex_math, mathml.encoding)
        # NOTE(review): this branch is unreachable - the identical condition
        # above has already returned; the guard probably meant
        # "everything_ok and not not_empty_result" - verify intent
        if everything_ok and not_empty_result and len(ascii_latex) < 6:
            # in case the service returns empty string and it seems to be just a variable
            _logger.warning(u"Warning: returning original - %s",
                            repr(ascii_latex))
            return mathml(ascii_latex).str, js

        # seems not ok but the latest converter returns valid results
        if not everything_ok and not_empty_result:
            _logger.warning(
                u"Warning: returning conversion but with errors - %s",
                repr(ascii_latex))
            return mathml(result).str, js

        _logger.error(
            u"\n!ERROR - converting [%s] -> result [%s] with message [%s]\n%s",
            ascii_latex, utils.uni(result), utils.uni(message), 40 * "=")
        return None, js
コード例 #35
0
    def execute(self, *args, **kwargs):
        """
            Execute the application: run the supplied text through the
            external Cesilko translation script and return the result.

            :return: result dict, (mime, text) tuple for raw-body callers,
                     or a self._failed() response on error
        """
        ret_mime = None

        if not cesilko.api_translate in args:
            return self._failed(
                detail="Invalid API - no method with such a name")

        # posted raw body
        if cesilko.api_key_body in kwargs:
            ret_mime = "text/plain"
            try:
                kwargs[cesilko.api_key_data] = utils.uni(self.posted_body())
            except Exception:
                return self._failed(detail="invalid posted body")

        # what should we translate?
        elif 0 == len(kwargs.get(cesilko.api_key_data, "")):
            # hardcoded fallback
            try:
                kwargs[cesilko.api_key_data] = utils.uni(self.posted_body())
                ret_mime = "text/plain"
                self.log("using fallback mechanism")
            except Exception:
                return self._failed(detail="missing data parameter")

        try:
            (input_f,
             input_fname_rel) = self._get_unique_file(enc='iso-8859-2')
            expected_output_file_name = input_f.name + ".SK.out"

            # 1. Input text is in UTF-8
            text = kwargs[cesilko.api_key_data]
            self.log("Received Input Text: %s ", text)
            self.log("Type of the Input: %s", str(type(text)))

            # 2. Convert the UTF-8 encoded text into ISO-8859-2 encoding.
            #    - non ISO-8859-2 characters will be replaced with XML numeric codes
            text_iso_dec = None
            try:
                text_iso = text.encode('iso-8859-2', 'xmlcharrefreplace')
                text_iso_dec = text_iso.decode('iso-8859-2')  # ISO-8859-2 text
                self.log(
                    "Replacing the Non ISO-8859-2 Characters Into XML Numeric Entities: %s",
                    text_iso_dec)
            except UnicodeEncodeError:
                # was missing the return, letting execution continue with
                # text_iso_dec = None
                return self._failed(detail="please supply utf-8 input.")

            with input_f as fout:
                fout.write(text_iso_dec)
                self.log("Written Input Text to File: %s", fout.name)
            cmd = "%s %s %s" % (cesilko.tr_script, input_f.name,
                                expected_output_file_name)
            self.log("Cesilko ran: [%s]", cmd)
            retcode, stdout, stderr = utils.run(cmd)
            output_exists = os.path.exists(expected_output_file_name)
            if 0 == retcode and output_exists:
                with open(expected_output_file_name, 'rb') as fin:
                    translated_text = fin.read()

                    # convert the ISO-8859-2 output text into UTF-8 text
                    translated_text_dec_utf = translated_text.decode(
                        'iso-8859-2')

                    # remove extra \n\n at the end of the translated text
                    # Cesilko adds this, so it can be removed safely here
                    translated_text_dec_utf = re.sub(r"\n\n$", "",
                                                     translated_text_dec_utf)

                    # remove extra spaces at the beginning and end
                    translated_text_dec_utf = re.sub(r"(^\s+|\s+$)", "",
                                                     translated_text_dec_utf)

                    self.log("The UTF-8 Encoded Output: %s",
                             translated_text_dec_utf)

                    ret = {"input": text, "result": translated_text_dec_utf}
                    # special for weblicht
                    if ret_mime is not None:
                        return ret_mime, ret["result"]

                    return ret
            else:
                return self._failed(
                    detail=
                    "retcode:%d, exists(%s)=%s, stdout=%s, stderr=%s, cmd=%s" %
                    (retcode, expected_output_file_name, output_exists, stdout,
                     stderr, cmd))

        except Exception as e:
            return self._failed(detail=utils.uni(e))
コード例 #36
0
ファイル: __init__.py プロジェクト: vidiecan/importer
    """
        Convert both formats.
    """
    from indexer.egomath.interface import egomath_inst

    mathml_repre = u"mathml:problem"
    tex_repre = u"tex:problem"
    mathmldone = False

    # noinspection PyUnusedLocal
    try:
        if not mathml_str is None and \
                len(mathml_str) > 0 and \
                mathml_str != mathdb.failed_mathml:
            mathml_repre = egomath_inst.math_from_mathml(mathml_str)
        mathmldone = True
        if not latex_str is None and \
                len(latex_str) > 0:
            tex_repre = egomath_inst.math_from_tex(latex_str)
            latex_str_cleanup = egomath_inst.math_from_tex_cleanup(latex_str)
            if latex_str_cleanup != latex_str:
                _logger.info(u"Changed\n[%s] to\n[%s]", latex_str, latex_str_cleanup)
            pass
        else:
            sys.exit("Fatal error")
    except Exception, e:
        _logger.exception("%s exception [%s]",
                         "MathML" if mathmldone is False else "TeX",
                         latex_str)
    return u"mathml:\n" + utils.uni(mathml_repre), u"tex:\n" + utils.uni(tex_repre)
コード例 #37
0
ファイル: _math.py プロジェクト: vidiecan/importer
        # <m:math display="inline"><m:semantics><m:mi>c</m:mi><m:annotation-xml encoding="MathML-Content"><m:ci>c</m:ci></m:annotation-xml></m:semantics></m:math>
        # vs <annotation id="p1.1.m1.1b" encoding="application/x-tex" xref="p1.1.m1.1.cmml">w</annotation>
        if _math_parser.re_tex_annotation.search(mathml_text) is None:
            # this can mean that we either do not have mathml
            for end_tag in ("</math>", "</m:math>"):
                if mathml_text.endswith(end_tag):
                    # we have mathml but no annotation
                    if end_tag == "</math>":
                        annotation = u"<annotation encoding=\"application/x-tex\">%s</annotation>" % latex_math
                    else:
                        annotation = u"<m:annotation encoding=\"application/x-tex\">%s</m:annotation>" % latex_math
                    mathml_text = mathml_text[:-len(
                        end_tag)] + annotation + mathml_text[-len(end_tag):]
                    break

        mathml_text = utils.uni(mathml_text)

        # post processing
        # - converter problem
        #
        for to_remove in (u"\end{document}", u"nowiki"):
            if to_remove in mathml_text:
                logger.warn(u"Invalid math [%s]", mathml_text)
                # set to invalid
                mathml_pickled.delete_invalid(latex_math_db_id)
                if try_one_more_if_invalid:
                    return convert_wikimath_to_realmath(
                        env_dict,
                        wiki_math_match,
                        mathml_pickled,
                        url,
コード例 #38
0
ファイル: __init__.py プロジェクト: vidiecan/importer
    """
    from indexer.egomath.interface import egomath_inst

    mathml_repre = u"mathml:problem"
    tex_repre = u"tex:problem"
    mathmldone = False

    # noinspection PyUnusedLocal
    try:
        if not mathml_str is None and \
                len(mathml_str) > 0 and \
                mathml_str != mathdb.failed_mathml:
            mathml_repre = egomath_inst.math_from_mathml(mathml_str)
        mathmldone = True
        if not latex_str is None and \
                len(latex_str) > 0:
            tex_repre = egomath_inst.math_from_tex(latex_str)
            latex_str_cleanup = egomath_inst.math_from_tex_cleanup(latex_str)
            if latex_str_cleanup != latex_str:
                _logger.info(u"Changed\n[%s] to\n[%s]", latex_str,
                             latex_str_cleanup)
            pass
        else:
            sys.exit("Fatal error")
    except Exception, e:
        _logger.exception("%s exception [%s]",
                          "MathML" if mathmldone is False else "TeX",
                          latex_str)
    return u"mathml:\n" + utils.uni(mathml_repre), u"tex:\n" + utils.uni(
        tex_repre)
コード例 #39
0
    def page_to_template( self, page, template ):
        """Render one raw wiki page into the indexing template.

        Extracts title/id/text from the page XML, skips excluded pages,
        strips wiki markup outside math, gathers categories, interwiki
        language links, a citation count and internal references, then
        substitutes everything into ``template``.

        Returns the filled template string, ``template`` unchanged when the
        page cannot be parsed, or ``None`` when the page is excluded.
        """
        # get basic info
        #
        pattern = re.compile(
            r'<title>(?P<title>.*?)</title>.*<id>(?P<id>.*?)</id>.*<revision.*<text[^>]*>(?P<text>.*?)</text>',
            re.DOTALL)
        keywords = {
            u"title": None,
            u"id": None,
            u"text": None,
        }
        m = pattern.search(page)
        if not m:
            pager.logger.error(u"Invalid page: could not find elements... %s", page)
            return template
        else:
            # DOTALL + greedy patterns would silently merge pages, hence the guard
            assert page.count( u"<title>" ) == 1, u"Matched more pages?"
            for k, v in keywords.iteritems():
                keywords[k] = m.group(k)

        # hard exclude by page id
        if utils.uni(keywords[u"id"]) in settings["exclude"]["ids"]:
            logger_suspicious.warning(u"Skipping this file (id in excludes)... %s [%s]",
                                 keywords[u"title"], keywords[u"id"])
            return None

        # hard exclude by title prefix (e.g. special namespaces)
        for not_acceptable_start in settings["exclude"]["title_starts"]:
            if keywords[u"title"].startswith(not_acceptable_start):
                logger_suspicious.warning(u"Skipping this file (invalid title start)... %s [%s]",
                                     keywords[u"title"], keywords[u"id"])
                return None

        # clean up text
        # - get math positions (do not clean text inside them)
        # - split text and clean up tokens between math
        #
        from _parser import parser as wikiparser
        text = wikiparser.remove_wiki_tags_outside_math(keywords[u"text"])
        keywords[u"text"] = text

        # get additional info
        # - categories are joined with self.delimiter; delimiter chars inside
        #   a category name are replaced by spaces so the join stays parseable
        category = re.compile(r'\[\[Category:(.+)\]\]').findall(page)
        keywords["category"] = self.delimiter.join(map(lambda x: x.replace(self.delimiter, " "), category))
        keywords["url"] = u"http://en.wikipedia.org/wiki/%s" % keywords[u"title"].replace(u" ", u"_")

        # interwiki language links: scan bottom-up, since such links
        # conventionally sit at the very end of the article
        #
        lang_avail = []
        lines = page.strip().split("\n")
        lang_pattern = re.compile(r'\[\[([a-z].+?):.*\]\]')
        for i in range(len(lines) - 1, 0, -1):
            m = lang_pattern.match(lines[i])
            if m:
                lang_avail.append(m.group(1))
            else:
                # stop at the first blank or non-markup line after the link run
                if lines[i].strip() == "":
                    break
                elif not lines[i].strip().startswith(u"<"):
                    break
        keywords["lang_avail"] = self.delimiter.join([x.replace(self.delimiter, " ") for x in lang_avail])

        # citation count approximated by escaped <ref> occurrences
        #
        keywords["citations_count"] = page.count(u"&lt;ref&gt;")

        refs = u""

        # problems:
        # [[File:Albedo-e hg.svg|thumb|Percentage of diffusely reflected sun light in
        #    relation to various surface conditions of the Earth]]
        #
        #
        def change_to_link( text ):
            """ Very simple text to link changer. """
            # drop anchor part, capitalise first char, spaces -> underscores
            text = text.split(u"#")[0]
            if len(text) > 0:
                text = text[0].upper() + text[1:].replace(u" ", "_")
                return text
            return u""

        # internal wiki links become <meta name="refs"> entries
        # NOTE(review): the second pattern is a superset of the first, so most
        # plain links are emitted twice - confirm the duplication is intended
        for cita in re.compile(r'\[\[([^:\]|]+?)\]\]').findall(page):
            refs += '<meta name="refs" content="%s" />\n' % change_to_link(cita)
        for cita in re.compile(r'\[\[([^:|]+?)\]\]').findall(page):
            refs += '<meta name="refs" content="%s" />\n' % change_to_link(cita)
        keywords["refs"] = refs

        page = None  # memory

        # substitute it
        #
        m = None  # memory
        try:
            return utils.subst_str_nonrecursive(template, keywords)
        except MemoryError, e:
            self.logger.exception(u"Memory exception - %s", repr(e))
コード例 #40
0
ファイル: _parser.py プロジェクト: vidiecan/importer
    def parse(self, text, page_str):
        global _egomath_inst
        # if math.mathml_db is None:
        #     math.mathml_db = _math.mathdb( _settings_local )

        id_str = page_str[180:260].replace("\n", " ")

        math_field = []
        positions = parser.positions_in_text(
            page_str,
            (_settings_local["pager"]["wiki_mathml_tags"], False),
            (_settings_local["pager"]["wiki_mathml_tags_v2"], False),
            (_settings_local["pager"]["wiki_math_tags"], True),
        )
        positions.sort(key=lambda x: x[0])

        tex_start = _settings_local["pager"]["tex_start"]
        math_sep = _settings_local["pager"]["math_sep"]

        # find all maths in the parsed page and match them with their
        # counterpart in the original text (get either tex or mathml)
        #
        for i, match in enumerate(math.math_final_matcher.finditer(text)):
            start_pos = max(
                match.start(0) -
                _settings_global["indexer"]["snippet_chars"] / 2, 0)
            end_pos = min(
                match.end(0) + _settings_global["indexer"]["snippet_chars"],
                len(text) - 1)

            # invalid math
            if len(positions) <= i:
                pass
            if positions[i][0] == -1:
                continue
            math_text = positions[i][1]
            math_representation = u""
            #logger.info( u"Working on [%s] len [%s][%s]", math_text, i, len(math_text) )
            if len(math_text) > 50000:
                logger_suspicious.warning(u"Math too big [%s] in [%s]",
                                          len(math_text), id_str)
                pass
            if len(math_text) > 0:
                try:
                    # mathml version
                    #
                    if math_text.startswith(
                            u"<m:math") or math_text.startswith(u"<math"):
                        math_tex_tmp = None
                        _bug_oneword = False
                        m = math.re_tex_annotation.search(math_text)
                        if not m:
                            logger_suspicious.warning(
                                "Did not find annotation in tex! [%s]", id_str)
                        else:
                            # do a bit of html->normal cleanup
                            math_tex_tmp = unescape_recursive(m.group(1))
                            # must be after unescape
                            math_tex_tmp = self.texify(math_tex_tmp)
                            _bug_oneword = re.compile(u"^[a-z]+$", re.U).match(
                                math_tex_tmp.lower())
                            math_text = math_text.replace(
                                m.group(1), math_tex_tmp)

                        math_representation = _egomath_inst.math_from_mathml(
                            math_text)

                        # FIX one word
                        if _bug_oneword and math_representation.count("*") > 0:
                            math_representation = u"Tex: %s\nego0 : %s\nego8 : id" % (
                                math_tex_tmp, math_tex_tmp)
                            logger_suspicious.debug(u"Fixing oneword [%s]",
                                                    math_tex_tmp)

                        # Log problem
                        if math_representation is None:
                            logger_suspicious.warning(
                                u"NULL returned from egomath [\n%s\n] in [%s]",
                                math_text.replace("\n",
                                                  " ").replace("\"", "\\\""),
                                id_str)

                        if math_tex_tmp is not None and (
                                math_representation is None
                                or 0 == len(math_representation)):
                            # try latex...
                            math_representation = _egomath_inst.math_from_tex(
                                math_tex_tmp)

                        if math_representation is None or 0 == len(
                                math_representation):
                            logger_suspicious.warning(
                                u"Empty math returned from egomath [\n%s\n] in [%s]",
                                math_text.replace("\n",
                                                  " ").replace("\"", "\\\""),
                                id_str)

                            # if not math.mathml_db is None:
                            #     i = mathml.get_id( math_text )
                            #     if not i is None:
                            #         # add it to db
                            #         math.mathml_db.add_ego_math( math_representation, i )

                    # tex version
                    #
                    else:
                        # do a bit of html->normal cleanup
                        math_text = unescape_recursive(math_text)
                        math_text = self.texify(math_text, leave_nl=True)
                        # do the conversion
                        math_representation = _egomath_inst.math_from_tex(
                            math_text)
                        # simulate text repre
                        math_text_tex = math_text.replace("\n", " ").strip()
                        math_representation = "Tex: %s\n" % math_text_tex + math_representation

                except Exception, e:
                    logger.exception(u"Cannot convert [%s] because of [%s]",
                                     math_text, utils.uni(e))
            else:
                logger_suspicious.info(u"Empty math in [%s]", id_str)

            if math_representation is None or len(math_representation) == 0:
                continue
                # convert Tex: line to ""showaeble"" tex
            # convert \d: line to math + end token so we can simulate full match
            #
            already_in = set()
            result_math = u""
            for line in math_representation.split("\n"):
                line = line.strip()
                if len(line) == 0:
                    continue
                    # deduplicity - already there
                if line in already_in:
                    continue
                already_in.add(line)

                # insert proper tags
                if not line.startswith(tex_start):
                    line = u"%s %s" % (line, math_sep[1])
                else:
                    line = u"%s %s %s %s" % (tex_start, math_sep[0],
                                             line[len(tex_start):],
                                             math_sep[1])
                result_math += u"%s\n" % line

            # ensure we get all formulae unparsed
            #
            if len(result_math) > 0:
                #math_field += self.snippet_populate(text, result_math, match, start_pos, end_pos)
                math_field += [
                    self.snippet_populate(text, result_math, match, start_pos,
                                          end_pos)
                ]
コード例 #41
0
ファイル: _parser.py プロジェクト: vidiecan/importer
    def parse(self, text, page_str):
        global _egomath_inst
        # if math.mathml_db is None:
        #     math.mathml_db = _math.mathdb( _settings_local )

        id_str = page_str[180:260].replace("\n", " ")

        math_field = []
        positions = parser.positions_in_text(page_str,
                                             (_settings_local["pager"]["wiki_mathml_tags"], False),
                                             (_settings_local["pager"]["wiki_mathml_tags_v2"], False),
                                             (_settings_local["pager"]["wiki_math_tags"], True),
        )
        positions.sort(key=lambda x: x[0])

        tex_start = _settings_local["pager"]["tex_start"]
        math_sep = _settings_local["pager"]["math_sep"]

        # find all maths in the parsed page and match them with their
        # counterpart in the original text (get either tex or mathml)
        #
        for i, match in enumerate(math.math_final_matcher.finditer(text)):
            start_pos = max(match.start(0) - _settings_global["indexer"]["snippet_chars"] / 2, 0)
            end_pos = min(match.end(0) + _settings_global["indexer"]["snippet_chars"], len(text) - 1)

            # invalid math
            if len(positions) <= i:
                pass
            if positions[i][0] == -1:
                continue
            math_text = positions[i][1]
            math_representation = u""
            #logger.info( u"Working on [%s] len [%s][%s]", math_text, i, len(math_text) )
            if len(math_text) > 50000:
                logger_suspicious.warning(u"Math too big [%s] in [%s]", len(math_text), id_str)
                pass
            if len(math_text) > 0:
                try:
                    # mathml version
                    #
                    if math_text.startswith(u"<m:math") or math_text.startswith(u"<math"):
                        math_tex_tmp = None
                        _bug_oneword = False
                        m = math.re_tex_annotation.search(math_text)
                        if not m:
                            logger_suspicious.warning("Did not find annotation in tex! [%s]", id_str)
                        else:
                            # do a bit of html->normal cleanup
                            math_tex_tmp = unescape_recursive(m.group(1))
                            # must be after unescape
                            math_tex_tmp = self.texify(math_tex_tmp)
                            _bug_oneword = re.compile(u"^[a-z]+$", re.U).match(math_tex_tmp.lower())
                            math_text = math_text.replace(m.group(1), math_tex_tmp)

                        math_representation = _egomath_inst.math_from_mathml(math_text)

                        # FIX one word
                        if _bug_oneword and math_representation.count("*") > 0:
                            math_representation = u"Tex: %s\nego0 : %s\nego8 : id" % (math_tex_tmp, math_tex_tmp)
                            logger_suspicious.debug(u"Fixing oneword [%s]", math_tex_tmp)

                        # Log problem
                        if math_representation is None:
                            logger_suspicious.warning(u"NULL returned from egomath [\n%s\n] in [%s]",
                                                      math_text.replace("\n", " ").replace("\"", "\\\""),
                                                      id_str)

                        if math_tex_tmp is not None and (math_representation is None or 0 == len(math_representation)):
                            # try latex...
                            math_representation = _egomath_inst.math_from_tex(math_tex_tmp)

                        if math_representation is None or 0 == len(math_representation):
                            logger_suspicious.warning(u"Empty math returned from egomath [\n%s\n] in [%s]",
                                                      math_text.replace("\n", " ").replace("\"", "\\\""),
                                                      id_str)

                            # if not math.mathml_db is None:
                            #     i = mathml.get_id( math_text )
                            #     if not i is None:
                            #         # add it to db
                            #         math.mathml_db.add_ego_math( math_representation, i )

                    # tex version
                    #
                    else:
                        # do a bit of html->normal cleanup
                        math_text = unescape_recursive(math_text)
                        math_text = self.texify(math_text, leave_nl=True)
                        # do the conversion
                        math_representation = _egomath_inst.math_from_tex(math_text)
                        # simulate text repre
                        math_text_tex = math_text.replace("\n", " ").strip()
                        math_representation = "Tex: %s\n" % math_text_tex + math_representation

                except Exception, e:
                    logger.exception(u"Cannot convert [%s] because of [%s]",
                                     math_text, utils.uni(e))
            else:
                logger_suspicious.info(u"Empty math in [%s]", id_str)

            if math_representation is None or len(math_representation) == 0:
                continue
                # convert Tex: line to ""showaeble"" tex
            # convert \d: line to math + end token so we can simulate full match
            #
            already_in = set()
            result_math = u""
            for line in math_representation.split("\n"):
                line = line.strip()
                if len(line) == 0:
                    continue
                    # deduplicity - already there
                if line in already_in:
                    continue
                already_in.add(line)

                # insert proper tags
                if not line.startswith(tex_start):
                    line = u"%s %s" % (line, math_sep[1])
                else:
                    line = u"%s %s %s %s" % (tex_start, math_sep[0],
                                             line[len(tex_start):],
                                             math_sep[1])
                result_math += u"%s\n" % line

            # ensure we get all formulae unparsed
            #
            if len(result_math) > 0:
                #math_field += self.snippet_populate(text, result_math, match, start_pos, end_pos)
                math_field += [self.snippet_populate(text, result_math, match, start_pos, end_pos)]
コード例 #42
0
ファイル: models.py プロジェクト: izderadicka/myplaces
 def append_if_exists(item):
     """Append the unicode form of *item* to the enclosing ``addr`` list.

     Falsy values (None, empty string, 0) are silently skipped.
     """
     if not item:
         return
     addr.append(uni(item))