-
Notifications
You must be signed in to change notification settings - Fork 0
/
duplicates.py
170 lines (140 loc) · 6.17 KB
/
duplicates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# coding: utf-8
#checks for duplicates in the db, and removes them.
import logging
from google.appengine.ext import db
from google.appengine.api import memcache
from models import Article, Company
logging.getLogger().setLevel(logging.DEBUG)
def companies():
    """Delete duplicate Company entities, first by name and then by ticker.

    Fetches up to 1000 company keys, loads each entity, and deletes any
    whose name was already seen; the survivors are then checked a second
    time for duplicate tickers. Logs the total number of deletions.

    Fixes vs. original: the "seen" collections are sets (O(1) membership
    instead of O(n) list scans), entities that disappear between the key
    fetch and the get are skipped instead of raising AttributeError, the
    misleading `duplicates` name is gone, and the commented-out dev-only
    scratch code has been removed.
    """
    q = Company.all(keys_only=True)
    # q.order("datetime") -- not possible on a keys-only query
    company_keys = q.fetch(1000)

    seen_names = set()   # company names encountered so far
    survivors = []       # entities that passed the name check
    delete_ctr = 0

    for key in company_keys:
        company = Company.get_by_id(key.id())
        if company is None:
            # Entity was deleted between the key fetch and the get.
            continue
        if company.name in seen_names:
            db.delete(company)
            delete_ctr += 1
        else:
            seen_names.add(company.name)
            survivors.append(company)

    # Second pass: among the name-unique survivors, dedupe on ticker.
    seen_tickers = set()
    for company in survivors:
        if company.ticker in seen_tickers:
            db.delete(company)
            delete_ctr += 1
        else:
            seen_tickers.add(company.ticker)

    logging.debug("deleted %s duplicate companies", delete_ctr)
# redundant because of titles check in scrape - and wrong, because it checks on keys, which are, of course, unique!
def articles():
    """Delete Article entities whose title duplicates an already-seen one.

    Uses two memcache entries:
      * ``"article_keys"``    -- keys of articles saved by the previous scrape
      * ``"duplicate_check"`` -- list of titles already known to exist

    When no title list is cached, it is seeded by scanning up to 500
    articles from the datastore. Logs the number of deletions.

    Fixes vs. original:
      * the datastore-seeded branch looked articles up by the stale loop
        variable ``key`` (from the seeding loop) instead of ``article_key``,
        so every lookup hit the same entity;
      * ``article_keys`` was cached under the ``"duplicate_check"`` memcache
        key in that branch, corrupting the title list;
      * ``article_keys`` was mutated with ``.remove()`` while being iterated
        (skips elements) -- replaced with a surviving-keys list;
      * memcache was rewritten once per article instead of once per run,
        and one of those writes had no expiry.
    """
    article_keys = memcache.get("article_keys")
    duplicate_check = memcache.get("duplicate_check")
    delete_ctr = 0

    if article_keys:  # stored by the previous scrape run
        if not duplicate_check:
            # No cached titles: seed the check list from the datastore.
            q = Article.all(keys_only=True)
            # q.order("datetime") -- not possible on a keys-only query
            keys = q.fetch(500)
            duplicate_check = []
            for key in keys:
                article = Article.get_by_id(key.id())
                if article:
                    duplicate_check.append(article.title)

        surviving_keys = []
        for article_key in article_keys:
            article = Article.get_by_id(article_key.id())
            if article is None:
                continue
            if article.title in duplicate_check:
                db.delete(article)
                delete_ctr += 1
            else:
                duplicate_check.append(article.title)
                surviving_keys.append(article_key)

        # Write both caches once, after the loop.
        memcache.set("duplicate_check", duplicate_check, 7200)
        memcache.set("article_keys", surviving_keys, 7200)
    elif duplicate_check:
        # No fresh scrape results: just refresh the title list's expiry.
        memcache.set("duplicate_check", duplicate_check, 7200)

    logging.debug("deleted %s duplicate articles", delete_ctr)
# not used right now: (use it in the run through all old articles routine that will use a larger instance)
def all_articles():
    """Scan up to 10000 Article entities and delete title duplicates.

    Not wired up currently; intended for a full pass over old articles on
    a larger instance. Fetches whole entities (not keys-only) so titles
    can be compared directly.

    Fixes vs. original: the fetched entities were misleadingly named
    ``article_keys``/``article_key`` (the old comments themselves complained
    about this), and the seen-title collection was a list, making each
    membership test O(n) over up to 10000 titles -- now a set.
    """
    q = Article.all()
    # q = Article.all(projection=["title"]) -- projection syntax did not work here
    articles = q.fetch(10000)

    seen_titles = set()
    for article in articles:
        if article.title in seen_titles:
            db.delete(article)
        else:
            seen_titles.add(article.title)
# -------------------------------------
# if duplicate_check:
# # her bør du adde dupe check og art keys. og sjekken bør være om det finnes art keys, ikke dupe checks.
# duplicates = []
# checked = []
# for article_id in duplicate_check: # removes the last (newest) in list
# article = Article.get_by_id(article_id)
# if article.url in duplicates:
# db.delete(article)
# duplicate_check.remove(article.key().id())
# if article_keys:
# article_keys.remove(article.key().id())
# else:
# duplicates.append(article.url)
# checked.append(article.title)
# duplicate_check.append(article.key().id())
# duplicates = []
# for title in checked:
# if title in duplicates:
# db.delete(article)
# duplicate_check.remove(article.key().id())
# if article_keys:
# article_keys.remove(article.key().id())
# else:
# duplicates.append(article.title)
# memcache.set("duplicate_check", duplicate_check, 11000)
# if article_keys:
# memcache.set("article_keys", article_keys, 11000)
# # articles = Article.all()
# # duplicates = []
# # for article in articles:
# # if article.content in duplicates: # probably never hits, because text is cleaned before new ones come in.
# # db.delete(article)
# # else:
# # duplicates.append(article.content)
def plagiarism(text):
    """Placeholder for a future content-based duplicate detector.

    Intended to run a more sophisticated duplicate check on the article
    body ``text``; currently a no-op that returns None.
    """
    pass