def process_item(self, item, spider):
    """Persist a crawled article into the MySQL `article` table.

    Skips the insert when an article with the same linkmd5id already
    exists.  Uses parameterized queries instead of %-interpolated SQL
    (the original was vulnerable to SQL injection via item fields),
    rolls back on failure, and always closes the cursor.

    :param item: scraped item with title/posttime/content/... fields
    :param spider: the spider that produced the item (unused here)
    :return: the item, so later pipeline stages can still process it
    """
    if item['title'] is not None and item['posttime'] is not None and item['content'] is not None:
        cursor = self.db.cursor()
        try:
            # Duplicate check keyed on the MD5 of the article link.
            cursor.execute(
                "SELECT id FROM article WHERE linkmd5id = %s",
                (item['linkmd5id'],))
            if cursor.rowcount == 0:
                # Segment the article text; the space-joined tokens are
                # stored alongside the article for later search/analysis.
                stopwords = fenci.getStopWords()
                contentWords = ' '.join(fenci.fenci(item['contentText'], stopwords))
                cursor.execute(
                    "INSERT INTO article(title, posttime, source_url, source_name, "
                    "content, content_text, link, linkmd5id, crawl_site, content_words) "
                    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                    (item['title'], item['posttime'], item['sourceUrl'],
                     item['sourceName'], item['content'], item['contentText'],
                     item['link'], item['linkmd5id'], item['crawlSite'],
                     contentWords))
                self.db.commit()
                print("-------insert success--------")
            else:
                print("-------duplicate link--------")
        except Exception as e:
            # Roll back so a failed statement does not poison the connection.
            self.db.rollback()
            print("-------db error: %s--------" % e)
        finally:
            cursor.close()
    # Scrapy pipelines must return the item for downstream pipelines.
    return item
def index():
    """Render the dashboard template with the top-10 rows of each data source."""
    context = {
        'weibo_data': BaseModel().select()[:10],
        'baidu_data': BaiduModal().select()[:10],
        'zhihu_data': ZhihuModal().select()[:10],
        'weixin_data': WeixinModal().select()[:10],
        'hotword': fenci(),
    }
    return render_template('template.html', **context)
def shaomiao(request):
    """Scan NewEventParagraph rows in a fixed id range and create jiabinc
    records for every name segmented out of each paragraph's text.

    Bug fixed: the original loop condition was ``while t < count + t`` —
    always true whenever count > 0, so the loop never terminated at the
    intended upper id.  The bound is now the fixed end id.

    :param request: Django request (unused beyond being a view signature)
    :return: HttpResponse('成功') on completion, '没有更新' if range empty
    """
    start_id = 935285
    end_id = 1192337  # exclusive upper bound of the id scan
    if end_id - start_id <= 0:
        return HttpResponse('没有更新')
    for row_id in range(start_id, end_id):
        try:
            event_c = models.NewEventParagraph.objects.get(id=row_id)
            event_id = event_c.cat_name_id
            # One jiabinc record per segmented token of the paragraph text.
            for word in fenci.fenci(str(event_c.txt)):
                models.jiabinc.objects.create(
                    username=word,
                    cat_event_id=event_id,
                    baikeURL='http://baike.baidu.com/search/word?word=' + str(word),
                    homeurl='http://www.huodongjia.com/event-' + str(event_id) + '.html'
                )
        except Exception:
            # Best-effort scan: missing ids / bad rows are skipped, matching
            # the original's intent (it swallowed all errors per row).
            continue
    return HttpResponse('成功')
def update_event(request):
    """Create jiabin_m records for each newly added event in `new_add_event`
    and record the last processed event id in jiabin_event (row id=1).

    Bug fixed: in the original, ``t += 1`` lived inside the ``try`` block,
    so any exception raised before it left ``t`` unchanged and the
    ``while t < count`` loop spun forever on the same index.  A ``for``
    loop over the indices cannot loop forever.

    :param request: Django request (view signature only)
    :return: '成功' on success, '更新错误' if a row fails, '没有更新' if empty
    """
    if len(new_add_event) <= 0:
        return HttpResponse('没有更新')
    for event_pk in new_add_event:
        try:
            event_c = models.NewEventParagraph.objects.get(id=event_pk)
            event_id = event_c.cat_name_id
            # One jiabin_m record per segmented token of the paragraph text.
            for word in fenci.fenci(str(event_c.txt)):
                models.jiabin_m.objects.create(
                    username=word,
                    cat_event_id=event_id,
                    baikeURL='http://baike.baidu.com/search/word?word=' + str(word),
                    homeurl='http://www.huodongjia.com/event-' + str(event_id) + '.html'
                )
            # Persist progress: last event id successfully processed.
            models.jiabin_event.objects.filter(id=1).update(all_event=event_pk)
        except Exception:
            # Original reported '更新错误' when a row failed; keep that contract.
            return HttpResponse('更新错误')
    return HttpResponse('成功')
def searchBoxGetPoiInfo(self, mName):
    """Fetch POI details for one search-box keyword and append them as CSV rows.

    :param mName: string of the form '<city name>,<poi name>'
    Side effects: appends one CSV line per POI to self.poisFile, and writes
    mName to self.currFile to record crawl progress.  Per-POI failures are
    printed and skipped.
    """
    sourceName = mName.split(',')
    # Segment the source POI name (Chinese word segmentation); the result is
    # the primary Baidu search keyword.
    wd = fenci.fenci(sourceName[1])
    wd2 = sourceName[0]  # city name, used as the secondary keyword
    poiInfoList = []  # accumulated CSV rows for every POI found for wd
    poiList = self.searchBoxGetPoiList(wd, wd2)  # search-box result list
    if poiList:
        for poi in poiList:
            # Reset every output field for this POI (pylgon is assigned but
            # never used below).
            (pylgon, csvName, name, uid, primary_uid, alias, addr,
             address_norm, area, area_name, catalogID, di_tag, std_tag,
             std_tag_id, tel, x, y, lon, lat, geo) = [
                [], '', '', '', '', '', '', '', '', '',
                '', '', '', '', '', '', '', '', '', '']
            try:
                csvName = sourceName[1]
                name = self.stripStr(poi.get('name', ''))
                uid = self.stripStr(poi.get('uid', '')) or ''
                alias = self.stripStr(poi.get('alias', ''))
                addr = self.stripStr(poi.get('addr', ''))
                address_norm = self.stripStr(poi.get('address_norm', ''))
                area = self.stripStr(poi.get('area', ''))
                area_name = self.stripStr(poi.get('area_name', ''))
                catalogID = self.stripStr(poi.get('catalogID', ''))
                di_tag = self.stripStr(poi.get('di_tag', ''))
                primary_uid = self.stripStr(poi.get('primary_uid', ''))
                std_tag = self.stripStr(poi.get('std_tag', ''))
                std_tag_id = self.stripStr(poi.get('std_tag_id', ''))
                tel = self.stripStr(poi.get('tel', ''))
                if isinstance(poi.get('x', ''), int):
                    # Raw coordinates appear to arrive scaled by 100; they are
                    # divided down, then converted to lon/lat via miToGPS —
                    # presumably Mercator-to-GPS; confirm against miToGPS.
                    x = poi.get('x', '') / 100
                    y = poi.get('y', '') / 100
                    pointGps = self.miToGPS(x, y)
                    lon = str(pointGps.get('lon', ''))
                    lat = str(pointGps.get('lat', ''))
                    x = str(x)
                    y = str(y)
                geo = self.uuidGetGeo(uid)
                # Normalize "no geometry" to an empty CSV field.
                if geo == []:
                    geo = ''
                print(",".join([
                    csvName, name, uid, primary_uid, alias, addr,
                    address_norm, area, area_name, catalogID, di_tag,
                    std_tag, std_tag_id, tel, x, y, lon, lat, geo
                ]))
                # Same fields as printed, with a trailing newline folded into
                # the last column so writelines() emits one row per POI.
                poiInfoList.append(",".join([
                    csvName, name, uid, primary_uid, alias, addr,
                    address_norm, area, area_name, catalogID, di_tag,
                    std_tag, std_tag_id, tel, x, y, lon, lat, geo + '\n'
                ]))
            except Exception as e:
                # Best-effort: report the failure and continue with next POI.
                print(e)
    with open(self.poisFile, mode='a+', encoding='gbk', errors=None) as f:
        # Append the collected rows to the POI CSV file.
        f.writelines(poiInfoList)
    with open(self.currFile, mode='w', encoding='gbk', errors=None) as f:
        # Record crawl progress: the keyword currently being processed.
        f.writelines(mName)
def tochart(path):
    """Build an HTML report (stacked bar + word clouds + pies) from an Excel
    sheet of event statistics and render it to
    '7月事件单分析TOP10+关键词.html'.

    :param path: path to the source Excel file (first sheet is read)
    :return: 0 on completion
    """
    df = pd.read_excel(path, sheet_name=0, encoding='ANSI')
    # NOTE(review): return value discarded — reset_index() is not in-place,
    # so this line is a no-op as written; confirm whether reassignment was
    # intended.
    df.reset_index()
    page = Page(page_title='7月事件单分析TOP10')
    # Stacked bar chart: first column is the category axis, remaining
    # columns are the stacked series.
    bar = Bar(width=1000, height=700)
    collist = df.columns.values.tolist()
    fenlei = df[collist[0]]
    # NOTE(review): range stops at len-1, so the last column is skipped —
    # confirm whether that column is intentionally excluded (e.g. a total).
    for col in range(1, len(collist) - 1):
        ds = collist[col]
        list2 = df[ds]
        bar.add(ds, fenlei, list2, is_stack=True, bar_category_gap='40%',
                xaxis_interval=0, xaxis_rotate=15, yaxis_rotate=30)
    page.add_chart(bar, name="bar")
    # Word cloud + pie chart for the top-3 categories.
    top = ""
    num = 30  # number of keywords fenci fills in per category
    wordcloud = []
    pie = []
    for i in range(0, 3):
        keyword = []
        value = []
        top = fenlei[i]
        # fenci fills keyword/value in place (out-parameters).
        fenci.fenci(top, num, keyword, value)
        print(keyword, value)
        # Word cloud of all returned keywords.
        wordcloud.append(
            WordCloud(title='↑关键词分析(TOP30):' + str(top),
                      title_text_size=14, title_top='bottom',
                      width=500, height=500))
        wordcloud[i].add(top, keyword, value, word_size_range=[20, 60],
                         shape='diamond')
        page.add_chart(wordcloud[i], name='wordcloud' + str(i))
        # Pie chart of the top-10 keywords only.
        pie.append(
            Pie(title='↑关键词分析(TOP10):' + str(top),
                title_text_size=14, title_top='bottom',
                width=600, height=500))
        pie[i].add(top, keyword[0:10], value[0:10], radius=[30, 60],
                   label_text_color=None, is_label_show=True,
                   legend_orient="vertical", legend_pos="left")
        page.add_chart(pie[i], name='pie' + str(i))
        print('-' * 10)
    page.render('7月事件单分析TOP10+关键词.html')
    return 0
#coding:utf8
import sys
from fenci import fenci


def main():
    """Demo: tokenize a sample Chinese sentence with fenci and print each token.

    Bug fixed: the original used Python 2 ``print`` statements, which are a
    syntax error under Python 3 and inconsistent with the ``print()`` calls
    used elsewhere in this project.
    """
    fc = fenci()
    fc.init_fenci()
    ret = fc.get_text_fc('王晓明是个大坏蛋,十点二十分天马行空般的去打篮球')
    for token in ret:
        # Each result row's first element is the token text.
        print(token[0])
        print('-----')


if __name__ == '__main__':
    main()