def main():
    from sx_storage import sxDBStorage
    dbsx = sxDBStorage()
    dbsx.ConnectMaps()
    fireNews = dbsx.LoadUnclassifiedFireNews()
    for fn in fireNews:
        # News and classify_tuple are defined elsewhere in the repo.
        news = News(fn["title"], fn["body"])
        predict = classify_tuple(news)
        dbsx.UpdateFireNewsClass(fn["id"], predict)
        #print(news, predict)
    return


import time

begin = time.time()
#test()
#score_corpus()
process()  # process() is defined elsewhere in this module
duration = time.time() - begin
dif = '{0} minutes {1} seconds'.format(int(duration // 60), round(duration % 60, 2))
print(dif)
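# News and classify_tuple are not defined in this snippet; a minimal,
# hypothetical sketch of the interface the loop above assumes (the real
# classifier lives elsewhere in the repo, and the keyword match below is
# only a placeholder, not the project's model):
class News(object):
    def __init__(self, title, body):
        self.title = title
        self.body = body


def classify_tuple(news):
    # Assumed contract: take a News object, return a class label.
    text = (news.title + " " + news.body).lower()
    return 1 if "fire" in text else 0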
def main():
    from sx_storage import sxDBStorage
    dbsx = sxDBStorage()
    dbsx.ConnectMaps()
    fireNews = dbsx.LoadUnclassifiedFireNews()
    #print(fireNews)
    # Train (or load) the model once, outside the loop.
    model = min_cl.learn()
    for fn in fireNews:
        predict = min_cl.predict((fn["title"], fn["body"]), model)[0]
        print(predict)
        #news = fire_classifier.News(fn["title"], fn["body"])
        #predict = fire_classifier.classify_tuple(news)
        #print(predict)
        dbsx.UpdateFireNewsClass(fn["id"], predict)
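# min_cl is imported elsewhere. A hypothetical stand-in for the learn/predict
# contract the loop above relies on, assuming learn() -> model and
# predict((title, body), model) -> [label, ...]; the Naive Bayes pipeline and
# the toy training data are assumptions, not the project's actual model:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


def learn():
    texts = ["fire destroyed a warehouse", "sunny weather expected"]
    labels = [1, 0]
    vect = CountVectorizer()
    clf = MultinomialNB().fit(vect.fit_transform(texts), labels)
    return vect, clf


def predict(title_body, model):
    vect, clf = model
    # Join title and body into one document, return a list of labels.
    return clf.predict(vect.transform([" ".join(title_body)])).tolist()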
import pickle
from os.path import isfile, splitext


def classify(type, file, lang):
    name, ext = splitext(file)
    pickle_fname = name + '.pck'
    if isfile(pickle_fname):
        # A cached model exists: load it.
        print("Loading model...")
        with open(pickle_fname, 'rb') as f:
            model = pickle.load(f)
        print("Model is loaded.")
    else:
        # No cache yet: learn the model and dump it to disk.
        print("Learning model...")
        model = mincl.learn(file, lang)
        print("Model is learned. Dumping model...")
        with open(pickle_fname, 'wb') as f:
            pickle.dump(model, f)
        print("Model is dumped.")
    dbsx = sxDBStorage()
    dbsx.ConnectMaps()
    news = dbsx.LoadUnclassifiedNews(type)
    for fn in news:
        #print(fn["title"])
        #time.sleep(5)
        predict = mincl.predict((fn["title"], fn["body"]), model)[0]
        print(predict)
        dbsx.UpdateNewsClass(fn["id"], predict)
    return
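# The load-or-train cache pattern above can be factored into one reusable
# helper; a minimal sketch, where load_or_learn and learn_fn are hypothetical
# names standing in for the mincl.learn call:
def load_or_learn(pickle_fname, learn_fn):
    # Return the cached model if present, otherwise train and cache it.
    if isfile(pickle_fname):
        with open(pickle_fname, 'rb') as f:
            return pickle.load(f)
    model = learn_fn()
    with open(pickle_fname, 'wb') as f:
        pickle.dump(model, f)
    return model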
def custom_strip_tags(value):
    soup = BeautifulSoup(value, "html.parser")
    allFontTags = soup.find_all("font", {"size": "-1"})
    # The second <font size="-1"> tag holds the article body; guard the
    # index so a page with fewer tags falls back to the raw value.
    if len(allFontTags) > 1:
        content = allFontTags[1]
    else:
        content = value
    result = re.sub(r'<[^>]*?>', ' ', unicode(content))
    return unicode(result)


#try:
dbsx = sxDBStorage()
dbsx.ConnectMaps()
print("connect")
for url in news_sources.keys():
    #if news_sources[url] != "fires_eng":
    #    continue
    try:
        fires_lenta = feedparser.parse(url)
        conn = odbc.odbc(
            "Driver={SQL Server Native Client 10.0};Server=___;"
            "Failover_Partner=___;Database=___;Uid=___;Pwd=___"
        )
        cur = conn.cursor()
        for entry in fires_lenta.entries:
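# The loop body is truncated in this snippet. A hypothetical sketch of the
# kind of work it performs: strip markup from each RSS entry and stage a row
# for the database write (title/description/link follow the feedparser API;
# the row layout is an assumption, not the project's actual schema):
rows = []
for entry in fires_lenta.entries:
    title = entry.get("title", "")
    body = custom_strip_tags(entry.get("description", ""))
    rows.append((title, body, entry.get("link", ""), news_sources[url]))
# The actual INSERT through `cur` is not shown in this snippet.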
    # Tail of a helper (apparently clusterize) that maps each item index to
    # its cluster centre, or None when the item is unclustered.
    item2center[i] = item2center.setdefault(i, None)
    return item2center


def clusterizer(iterable, key1="Title", key2="Description"):
    # Assumes TfidfVectorizer (sklearn), distance (scipy.spatial), np (numpy)
    # and dt (datetime) are imported above the snippet.
    vect = TfidfVectorizer()
    # One document per item: title and body joined with a space (the original
    # single-element join concatenated the two fields with no separator).
    X = np.array([elem[key1] + " " + elem[key2] for elem in iterable])
    t = vect.fit_transform(X)
    # Full pairwise distance matrix over the TF-IDF vectors.
    ds = distance.squareform(distance.pdist(t.toarray()))
    items2neighbours = init_cluster(ds, 1.2)
    clusters = clusterize(items2neighbours, ds)
    return clusters


if __name__ == '__main__':
    db = sxDBStorage()
    db.ConnectMaps()
    # Cluster the news collected over the last 24 hours.
    date_start = dt.datetime.utcnow() + dt.timedelta(hours=-24)
    data = db.LoadLastNews(date_start.strftime("%Y-%m-%d %H:%M:%S"))
    count = len(data)
    clustered_data = clusterizer(data, "title", "body")
    clusters = []
    for i in range(count):
        if clustered_data[i] is None:
            clusters.append((str(data[i]["id"]), "NULL"))
            #print(data[i]["id"], "NULL")
        else:
            clusters.append((str(data[i]["id"]),
                             str(data[clustered_data[i]]["id"])))
            #print(data[i]["id"], data[clustered_data[i]]["id"])
    #print(clusters)
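# init_cluster is not shown in this snippet, and only the tail of its
# companion appears above. A hypothetical sketch inferred from the call
# sites: init_cluster(ds, threshold) maps each item index to the indices of
# its neighbours closer than `threshold`, and clusterize turns that into
# {item_index: centre_index or None}; the nearest-neighbour centre rule is
# an assumption:
def init_cluster(ds, threshold):
    items2neighbours = {}
    for i in range(len(ds)):
        items2neighbours[i] = [j for j in range(len(ds))
                               if j != i and ds[i][j] < threshold]
    return items2neighbours


def clusterize(items2neighbours, ds):
    item2center = {}
    for i, neighbours in items2neighbours.items():
        if neighbours:
            # Attach the item to its nearest neighbour as a crude centre.
            item2center[i] = min(neighbours, key=lambda j: ds[i][j])
        # Items with no neighbours stay unclustered (None), matching the
        # tail shown at the top of the snippet.
        item2center[i] = item2center.setdefault(i, None)
    return item2center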