def check_lang(text, max_len=2000):
    """Detect the language of *text*, examining at most *max_len* characters.

    The fast heuristic in ``_check_lang`` runs first; when it reports
    "english" the (slower) ``LangDetect`` detector re-checks the sample.
    """
    sample = strip_tags(text[:max_len])
    detected = _check_lang(sample)
    if detected != "english":
        return detected
    # Heuristic said "english" -- confirm with the full detector.
    return LangDetect().detect(sample)
Example #2
0
 def __init__(self, filename = "/run/media/eea1ee1d-e5c4-4534-9e0b-24308315e271/pynews/stream/clusteringData.db"):
     """Cluster the stored documents into three size classes.

     Builds a 3-means clusterer over feature vectors of the English,
     non-error documents found in the shelve at *filename*, then labels
     the resulting clusters "short"/"medium"/"long" ordered by their mean
     word count.  ``classify`` maps a document to one of those labels.
     """
     logger.info("Start building " + self.__class__.__name__)
     self.__mutex = threading.Semaphore()
     data = shelve.open(filename, protocol=-1, flag="r")
     langDetect = LangDetect.instance()
     # BUG FIX: the original used `is "en"`, which tests object identity,
     # not string equality; any match was an accident of CPython string
     # interning.  `==` matches English documents reliably.
     vectors = [features(item["text"]) for digest, item in data.items() if item["text"] and item["text"] != "ERROR" and langDetect.detect(item["text"]) == "en"]
     self.__maxV = calcDiv(vectors)
     means = [array([10, 40, 0, 1]), array([30, 340, 2, 30]), array([120, 1500, 15, 50])]
     self.__clusterer = cluster.KMeansClusterer(3, euclidean_distance, initial_means=means, avoid_empty_clusters=True)
     self.__clusterer.cluster(vectors)
     # Per-cluster document count and total word count, keyed by cluster id.
     klassIdToSize = {"0": 0, "1": 0, "2": 0}
     klassIdToWordsCount = {"0": 0, "1": 0, "2": 0}
     for item in data.itervalues():
         text = item["text"]
         if text and text != "ERROR":
             feat = features(text)
             klass = str(self.__clusterer.classify(feat))
             klassIdToSize[klass] += 1
             klassIdToWordsCount[klass] += len(text.split())
     data.close()
     results = []
     for klassId in ["0", "1", "2"]:
         meanWordsInKlass = klassIdToWordsCount[klassId] / klassIdToSize[klassId] if klassIdToSize[klassId] != 0 else 0
         results.append({"klass": klassId, "mean" : meanWordsInKlass})
     logger.info("Clustering results: " + str(results))
     # BUG FIX: the original passed `lambda x,y: x["mean"] < y["mean"]` as a
     # Python 2 cmp function; a boolean never returns -1, so the sort order
     # was unreliable.  A key function sorts ascending by mean word count.
     sortedKlass = sorted(results, key=lambda x: x["mean"])
     # Smallest-mean cluster -> "short", largest -> "long".
     self.__klassIdToLabel = {klassIdWithLabel[0]: klassIdWithLabel[1] for klassIdWithLabel in zip([item["klass"] for item in sortedKlass], ["short", "medium", "long"])}
Example #3
0
 def __init__(self, mainDir, input, inlinedWebpageDir):
     """Load rows from the input shelve and start the URL downloader.

     Creates the inlined-webpage directory (and its "htmls" subdirectory)
     when missing, wraps every non-ignorable stored item in a RowModel,
     collects the known classes, and subscribes to "model.save" events
     before starting the downloader controller.
     """
     self.__mainDir = mainDir
     self.__input = input
     self.__langId = LangDetect.instance()
     self.__inlinedWebpageDir = inlinedWebpageDir
     if not os.path.exists(self.__inlinedWebpageDir):
         os.makedirs(self.__inlinedWebpageDir)
     htmlsPath = os.path.join(self.__inlinedWebpageDir, "htmls")
     if not os.path.exists(htmlsPath):
         os.makedirs(htmlsPath)
     shelf = shelve.open(self.__input)
     self.__data = []
     self.__classes = set([self.defaultClass()])
     url2klass = self.__readKlassFile()
     logger.info("Read shelve...")
     for entry in shelf.itervalues():
         text = entry["text"]
         url = entry["url"]
         klass = self.__getKlass(url2klass, url)
         if self.__ignorable(text, url):
             continue
         self.__data.append(RowModel(url, text, klass, self))
         if klass:
             self.__classes.add(klass)
     logger.info("Done " + str(len(self.__data)))
     Publisher.subscribe(self._onSave, "model.save")
     self.__downloader = UrlDownloaderController(self)
     self.__downloader.start()
Example #4
0
 def setTextAndHtmlAndUrl(self, text, html, url):
     """Store the downloaded *text*, *html* and final *url*; detect language.

     *url* may differ from the originally requested URL after redirects;
     a redirect is logged.  Raises ValueError when any argument is None.
     """
     if text is None:
         raise ValueError("Text is None!")
     if html is None:
         raise ValueError("HTML is None!")
     if url is None:
         raise ValueError("URL is None!")
     # Logged before __realUrl is overwritten so the original URL appears.
     logger.info(u"Url " + self.__realUrl + u" resolved")
     self.__text = text
     self.__html = html
     if self.__realUrl != url:
         logger.info(u"Redirected from \"" + self.__realUrl + u"\" to \"" + url + u"\"")
     self.__realUrl = url
     try:
         self.__lang = LangDetect.instance().detect(text) if text else None
     except BaseException:
         logger.exception(u"lang detect error: " + unicode(text))
         # BUG FIX: bare `raise` re-raises with the original traceback;
         # the original `raise e` reset it and hid the real failure point.
         raise
Example #5
0
        sortedKlass = sorted(results, lambda x,y: x["mean"] < y["mean"])
        self.__klassIdToLabel = {klassIdWithLabel[0]: klassIdWithLabel[1] for klassIdWithLabel in zip([item["klass"] for item in sortedKlass], ["short", "medium", "long"])}

    def classify(self, document):
        """Return the size label ("short"/"medium"/"long") for *document*.

        Thread-safe: classification runs under the instance mutex.
        """
        # BUG FIX: the original called acquire() inside the try, so a
        # failure during acquire would still run release() in finally for
        # a semaphore that was never obtained.  The context manager pairs
        # acquire/release correctly.
        with self.__mutex:
            feat = features(document)
            docClass = self.__clusterer.classify(feat)
            return self.__klassIdToLabel[str(docClass)]

# Smoke-test entry point: build the clusterer, then collect the English
# documents from the same shelve used for training as test input.
if __name__ == "__main__":
    c = DocumentSizeClustering()
    langDetect = LangDetect.instance()
    data = shelve.open("/run/media/eea1ee1d-e5c4-4534-9e0b-24308315e271/pynews/stream/clusteringData.db", protocol=-1, flag="r")
    print "Documents: " + str(len(data))
    # NOTE(review): `position` and `labels` are unused in the visible code;
    # presumably consumed further below -- verify before removing.
    position = 0
    labels = {"short": 0, "medium": 0, "long": 0}
    input = []
    # FIXME(review): `is "en"` compares object identity, not equality -- this
    # almost certainly should be `== "en"`; as written any match relies on
    # accidental CPython string interning and is not guaranteed.
    for digest, item in data.items():
        if item["text"] and item["text"] != "ERROR" and langDetect.detect(item["text"]) is "en":
            input.append(item)
    testItems = input
    #testItems = []
    #for i in range(0, 150):
    #    e = choice(input)
    #    input.remove(e)
    #    testItems.append(e)