/
review_to_db.py
65 lines (53 loc) · 1.74 KB
/
review_to_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import re
from pymongo import MongoClient
from collections import Counter
from konlpy.tag import Kkma
from konlpy.utils import pprint
import hashlib
import datetime
from tags import tags as TAGS
#TAGS = [tag.encode('utf-8') for tag in TAGS]
# Handle on the `review` collection of the `neural` database, served by the
# MongoDB instance on localhost:27017.
# NOTE(review): an earlier revision pointed at db['poet'] instead - presumably
# for the poem corpus; confirm before switching back.
client = MongoClient(host='localhost', port=27017)
db = client.neural
collection = db.review
def get_poets(paths=('out.txt', 'demo.txt')):
    """Read the corpus files and split them into blank-line separated entries.

    The files are read in the given order and concatenated. Every run of two
    or more newlines is then replaced by a tab and the text is split on tabs,
    so a tab already present in the input also acts as a separator. Single
    newlines inside an entry are kept.

    Args:
        paths: Iterable of file paths to read, in order. Defaults to the two
            files this script has always read.

    Returns:
        A list of decoded text entries. An empty input yields [''].

    Raises:
        IOError/OSError: if one of the files cannot be opened.
    """
    chunks = []
    for path in paths:
        # Read raw bytes and decode explicitly. A text-mode read() returns
        # `str` on Python 3, which has no .decode(); binary mode gives the
        # same result on Python 2 and 3. Undecodable bytes are dropped.
        with open(path, 'rb') as f:
            chunks.append(f.read().decode('utf-8', 'ignore'))
    return re.sub('\n\n+', '\t', ''.join(chunks)).split('\t')
def get_reviews(path='movie_out.txt'):
    """Read the reviews file and return its lines.

    Args:
        path: File to read. Defaults to the file this script has always read.

    Returns:
        A list of decoded text lines, split on '\\n'. A trailing newline in
        the file produces a final empty string.

    Raises:
        IOError/OSError: if the file cannot be opened.
    """
    # Read raw bytes and decode explicitly. A text-mode read() returns `str`
    # on Python 3, which has no .decode(); binary mode gives the same result
    # on Python 2 and 3. Undecodable bytes are dropped.
    with open(path, 'rb') as f:
        text = f.read().decode('utf-8', 'ignore')
    return text.split('\n')
# Load every review line, tag it with the nouns that also appear in TAGS, and
# store it in MongoDB unless a document with the same SHA-1 digest exists.

# Set to True to also build `c`, a noun-frequency Counter over all reviews.
tag_needed = False

#poets = get_poets()
poets = get_reviews()

# The analyzer is created before either loop: the counting branch below used
# to reference `kkma` before it was assigned and crashed with NameError.
kkma = Kkma()

if tag_needed:
    c = Counter()
    for poet in poets:
        for sentence in poet.split('\n'):
            try:
                c.update(kkma.nouns(sentence))
            except Exception:
                # Best effort, as before: a sentence the analyzer cannot
                # handle is left out of the counts instead of aborting.
                continue

# Build the lookup once so each noun is an O(1) membership test.
known_tags = frozenset(TAGS)

for idx, poet in enumerate(poets):
    # Blank lines (e.g. the trailing one produced by split('\n')) carry no
    # review text, so they are not stored. `idx` still reflects the position
    # in the input.
    if not poet.strip():
        continue

    tags = [noun for noun in kkma.nouns(poet) if noun in known_tags]

    # The SHA-1 of the UTF-8 text is the de-duplication key.
    hex_dig = hashlib.sha1(poet.encode('utf-8', 'ignore')).hexdigest()
    if collection.find_one({'hex': hex_dig}):
        continue

    document = {
        'text': poet,
        'index': idx,
        'tags': tags,
        'hex': hex_dig,
        'like': 0,
        'date': datetime.datetime.utcnow(),
    }
    # Collection.insert() was deprecated in PyMongo 3 and removed in 4;
    # insert_one() is the single-document replacement.
    collection.insert_one(document)