Example #1
def main():
    from sx_storage import sxDBStorage   # project-local storage helper
    dbsx = sxDBStorage()
    dbsx.ConnectMaps()
    fireNews = dbsx.LoadUnclassifiedFireNews()
    for fn in fireNews:
        # News and classify_tuple are defined elsewhere in the module
        # (the fire_classifier module referenced in the examples below)
        news = News(fn["title"], fn["body"])
        predict = classify_tuple(news)
        dbsx.UpdateFireNewsClass(fn["id"], predict)
        #print(news, predict)

    return
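The News container and classify_tuple helper are not shown in this example. A minimal sketch of the interface the loop above assumes (the class layout and return value are assumptions, not the actual fire_classifier code):

class News(object):
    # hypothetical container matching the News(title, body) call above
    def __init__(self, title, body):
        self.title = title
        self.body = body

def classify_tuple(news):
    # placeholder: the real classifier scores news.title / news.body
    # against a trained model and returns a class label
    return "fire" if "fire" in news.title.lower() else "other"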

Example #2

import time

begin = time.time()

#test()
#score_corpus()
process()   # process() is defined elsewhere in the original module

duration = time.time() - begin
dif = '{0} minutes {1} seconds'.format(int(duration // 60),
                                       round(duration % 60, 2))
print(dif)
Example #3

def main():
    from sx_storage import sxDBStorage
    dbsx = sxDBStorage()
    dbsx.ConnectMaps()
    fireNews = dbsx.LoadUnclassifiedFireNews()
    #print(fireNews)
    model = min_cl.learn()   # min_cl: the project's classifier module
    for fn in fireNews:
        predict = min_cl.predict((fn["title"], fn["body"]), model)[0]
        print(predict)
        #news = fire_classifier.News(fn["title"], fn["body"])
        #predict = fire_classifier.classify_tuple(news)
        #print(predict)
        dbsx.UpdateFireNewsClass(fn["id"], predict)
Example #4

import pickle
from os.path import isfile, splitext

def classify(type, file, lang):  # note: parameter names shadow Python built-ins
    name, ext = splitext(file)
    pickle_fname = name + '.pck'
    if isfile(pickle_fname):  # if a cached model exists, load it
        print("Loading model...")
        model = pickle.load(open(pickle_fname, 'rb'))
        print("Model is loaded.")
    else:  # otherwise learn the model and cache it to disk
        print("Learning model...")
        model = mincl.learn(file, lang)
        print("Model is learned. Dumping model...")
        pickle.dump(model, open(pickle_fname, 'wb'))
        print("Model is dumped.")
    dbsx = sxDBStorage()
    dbsx.ConnectMaps()
    news = dbsx.LoadUnclassifiedNews(type)
    for fn in news:
        #print(fn["title"])
        #time.sleep(5)
        predict = mincl.predict((fn["title"], fn["body"]), model)[0]
        print(predict)
        dbsx.UpdateNewsClass(fn["id"], predict)
        #return
    return
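A hypothetical call, assuming a corpus file path and the type/lang values the project might use (all three arguments here are invented for illustration):

# first run learns the model from corpus/fire_corpus.txt and pickles it
# to corpus/fire_corpus.pck; later runs load the pickle directly
classify("fire", "corpus/fire_corpus.txt", "ru")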

Example #5

import re
from bs4 import BeautifulSoup

def custom_strip_tags(value):
    soup = BeautifulSoup(value, "html.parser")
    allFontTags = soup.find_all("font", {"size": "-1"})
    if len(allFontTags) > 1:
        # take the second <font size="-1"> block (the article body);
        # reuse the list already computed and guard against a single match
        content = allFontTags[1]
    else:
        content = value
    # replace every remaining tag with a space
    result = re.sub(r'<[^>]*?>', ' ', unicode(content))

    return unicode(result)
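A quick illustration with made-up markup (Python 2, like the rest of these examples):

sample = ('<font size="-1">posted 2 hours ago</font>'
          '<font size="-1"><b>Fire</b> reported near the station</font>')
print(custom_strip_tags(sample))
# prints roughly: "  Fire  reported near the station " -- the second
# size="-1" block is kept and its tags are replaced with spaces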


Example #6

import feedparser
import odbc   # legacy win32 ODBC module (Python 2)

#try:
dbsx = sxDBStorage()

dbsx.ConnectMaps()
print("connect")
# news_sources maps feed URLs to category names elsewhere in the module
for url in news_sources.keys():
    #if news_sources[url] != "fires_eng":
    #	continue
    try:
        fires_lenta = feedparser.parse(url)

        # server name and credentials are placeholders in the source
        conn = odbc.odbc(
            "Driver={SQL Server Native Client 10.0};Server=___;Failover_Partner=___;Database=___;Uid=___;Pwd=___"
        )
        cur = conn.cursor()

        for entry in fires_lenta.entries:
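            # The loop body is not preserved in the source. What follows is
            # a sketch of what such an ingestion loop could do with
            # feedparser entries, assuming a DB-API-style cursor; the news
            # table and its columns are invented, not the project's schema.
            cur.execute(
                "INSERT INTO news (title, body, source) VALUES (?, ?, ?)",
                (entry.title, entry.get("description", ""), news_sources[url]),
            )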
Example #7

import datetime as dt

import numpy as np
from scipy.spatial import distance
from sklearn.feature_extraction.text import TfidfVectorizer

# Only the tail of init_cluster survives in this example: it finalises
# the item -> nearest-neighbour map and returns it.
        item2center[i] = item2center.setdefault(i, None)
    return item2center


def clusterizer(iterable, key1="Title", key2="Description"):
    vect = TfidfVectorizer()
    # one tf-idf document per item: title and body joined with a space
    X = np.array([" ".join([elem[key1], elem[key2]]) for elem in iterable])
    t = vect.fit_transform(X)
    # full pairwise distance matrix between the tf-idf vectors
    ds = distance.squareform(distance.pdist(t.toarray()))
    items2neighbours = init_cluster(ds, 1.2)
    clusters = clusterize(items2neighbours, ds)
    return clusters
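init_cluster and clusterize are not defined in these examples (only init_cluster's tail appears above). A minimal sketch of implementations consistent with the calls here -- nearest-neighbour linking under a distance threshold -- offered purely as a guess at the missing code:

import numpy as np

def init_cluster(ds, threshold):
    # Map each item index to its nearest neighbour when the pairwise
    # distance is below the threshold, otherwise to None.
    item2center = {}
    for i in range(ds.shape[0]):
        dists = ds[i].copy()
        dists[i] = np.inf                 # ignore distance to self
        j = int(np.argmin(dists))
        if dists[j] < threshold:
            item2center[i] = j
        item2center[i] = item2center.setdefault(i, None)
    return item2center

def clusterize(items2neighbours, ds):
    # Follow neighbour links to a single representative per item; the
    # representative itself keeps None (matching the NULL case below).
    # ds is unused in this sketch but kept to match the call signature.
    clusters = {}
    for i, j in items2neighbours.items():
        seen = {i}
        while j is not None and j not in seen:
            seen.add(j)
            j = items2neighbours[j]
        rep = min(seen)
        clusters[i] = None if rep == i else rep
    return clusters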


if __name__ == '__main__':
    db = sxDBStorage()
    db.ConnectMaps()
    # cluster everything published in the last 24 hours
    date_start = dt.datetime.utcnow() + dt.timedelta(hours=-24)
    data = db.LoadLastNews(date_start.strftime("%Y-%m-%d %H:%M:%S"))
    count = len(data)
    clustered_data = clusterizer(data, "title", "body")
    clusters = []
    for i in range(0, count):
        if clustered_data[i] is None:
            # the item is its own cluster representative
            clusters.append((str(data[i]["id"]), "NULL"))
        else:
            # the item joins its representative's cluster
            clusters.append((str(data[i]["id"]),
                             str(data[clustered_data[i]]["id"])))
    #print(clusters)