This repository has been archived by the owner on Nov 30, 2021. It is now read-only.
/
cate_mongo.py
96 lines (65 loc) · 2.46 KB
/
cate_mongo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# CATEGORIZATION PROGRAM: counts occurrences of a keyword in four domains. Articles from Freebase are stored in the database.
# MongoDB regular-expression queries are used, which eliminates the need to search with NLTK and avoids parsing every article repeatedly.
# DEPENDENCY: MongoDB
#
#
from pymongo import Connection
import re
from imdb import IMDb
# Connect to MongoDB (Connection() is called with no arguments) and select
# the database that holds the per-domain collections queried below.
connection = Connection()
db = connection["dataset_db"]
# Module-level counter; not referenced anywhere in the visible code.
i = 0
class categorization:
    """Categorise a keyword by counting regex matches in four MongoDB collections.

    The collections ``film_data``, ``book_data``, ``location_data`` and
    ``people_data`` are searched on their ``name`` and ``article`` fields.
    A domain is reported when its match count is strictly above the mean of
    the four counts.
    """

    def __init__(self, database=None):
        """Create a categoriser.

        database -- optional database handle exposing the four ``*_data``
                    collections; when omitted, the module-level ``db`` is
                    looked up each time ``get_category`` runs.
        """
        print("hello")
        self._database = database

    def init(self):
        """Kept for backward compatibility; it has never had any effect."""
        return None

    @staticmethod
    def _count_matches(collection, field, kw):
        """Return how many documents in `collection` have `field` matching `kw`."""
        # NOTE(review): kw is handed to MongoDB as a raw, case-insensitive
        # regular expression, so metacharacters in a keyword are interpreted
        # as regex syntax rather than literal text.
        return collection.find({field: {"$regex": kw, "$options": "i"}}).count()

    def get_category(self, kw):
        """Return the list of domains whose match count for `kw` is above average.

        kw -- keyword, used as a case-insensitive regex pattern.

        Returns a list containing any of "film", "books", "people" and
        "location", in that order. Per-domain counts are printed along the
        way, and when a film name matches, ``search_film`` is called to
        print a plot from IMDb.
        """
        self.init()

        database = getattr(self, "_database", None)
        if database is None:
            database = db

        # Query the film names once and reuse the count for both the total
        # and the "is this a movie title?" check.
        film_hits = self._count_matches(database.film_data, "name", kw)
        print("\nname count = %d" % film_hits)
        if film_hits:
            self.search_film(kw)
        film_hits += self._count_matches(database.film_data, "article", kw)

        book_hits = (self._count_matches(database.book_data, "name", kw)
                     + self._count_matches(database.book_data, "article", kw))
        location_hits = (self._count_matches(database.location_data, "name", kw)
                         + self._count_matches(database.location_data, "article", kw))
        people_hits = (self._count_matches(database.people_data, "name", kw)
                       + self._count_matches(database.people_data, "article", kw))

        # Mean of the four counts. Dividing by 4.0 gives the same comparison
        # results as the floored integer mean because the counts are integers.
        threshold = (film_hits + book_hits + location_hits + people_hits) / 4.0

        print("\n\n" + kw + "\nfilm:" + str(film_hits) + "\nbook:" + str(book_hits)
              + "\npeople:" + str(people_hits) + "\nlocation:" + str(location_hits))
        print("CATEGORY:")

        category = []
        ranked = (
            ("film", film_hits),
            ("books", book_hits),
            ("people", people_hits),
            ("location", location_hits),
        )
        for label, hits in ranked:
            if threshold < hits:
                print(" " + label + " ")
                category.append(label)
        return category

    def search_film(self, kw):
        """Print the first plot of the top IMDb search result for `kw`.

        Nothing is printed when the search returns no results; an empty line
        is printed when the movie has no plot.
        """
        ia = IMDb()
        results = ia.search_movie(kw)
        if not results:
            # No IMDb match: indexing the first result would raise IndexError.
            return
        movie = ia.get_movie(results[0].movieID)
        # Fall back to an empty plot when the key is missing or the list is empty.
        plots = movie.get("plot") or ['']
        # Keep only the text before '::' (presumably an author suffix added
        # by IMDbPY -- confirm against the library documentation).
        print(plots[0].split('::')[0])