/
test.py
78 lines (65 loc) · 2.39 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from unicodecsv import DictReader
from datetime import datetime
from whoosh.filedb.filestore import RamStorage
from whoosh.fields import TEXT, NUMERIC, Schema
from whoosh.query import Term, Or
from collections import OrderedDict
import re
# import pdb; pdb.set_trace()
def parse_final_csv():
    """Load ``final.csv`` and return its positive-amount rows, oldest first.

    Each row's ``date`` field is parsed from ``dd/mm/yy`` text into a
    ``datetime.date`` and its ``amount`` field is converted to ``float``.
    Rows whose amount is not strictly positive are dropped.

    Returns:
        A list of row dicts sorted by ascending ``date``.

    Raises:
        ValueError: if a ``date`` does not match ``%d/%m/%y`` or an
            ``amount`` string is not a valid float.
    """
    # The reader is handed a binary stream (unicodecsv does the decoding);
    # the ``with`` block closes the handle even if reading fails.
    with open('final.csv', 'rb') as f:
        parsed = list(DictReader(f))
    for line in parsed:
        line["date"] = datetime.strptime(line["date"], '%d/%m/%y').date()
        line["amount"] = float(line["amount"])
    positive = [x for x in parsed if x['amount'] > 0]
    return sorted(positive, key=lambda x: x["date"])
# Positive-amount rows from final.csv, sorted by ascending date.
by_date = parse_final_csv()
# The oldest two thirds form the learning set, the remainder the test set.
# Floor division keeps the slice index an int on Python 3 as well as Python 2.
learning_len = 2 * len(by_date) // 3
to_learn = by_date[:learning_len]
to_test = by_date[learning_len:]
def mock_tenants(prev_imports):
    """Build a mock tenant list from previously imported rows.

    Args:
        prev_imports: iterable of row dicts, each with a ``tenant`` string.

    Returns:
        A list of ``{"id": i, "name": name}`` dicts, one per distinct
        lower-cased tenant name, in order of first appearance. ``id`` is the
        entry's position in the returned list.
    """
    # Read from the argument rather than a module-level global, so the
    # function works for any row set it is given.
    # OrderedDict keys de-duplicate while preserving first-seen order.
    names = OrderedDict.fromkeys(
        row["tenant"].lower() for row in prev_imports)
    return [{"id": i, "name": name} for i, name in enumerate(names)]
# Mock tenant records ({"id", "name"}) derived from the rows in to_learn.
known_tenants = mock_tenants(to_learn)
def create_tenant_schema(tenants):
    """Index the given tenants with Whoosh and return the index.

    Args:
        tenants: iterable of dicts with an ``id`` and a ``name`` entry.

    Returns:
        An index created on a ``RamStorage`` whose documents store the
        tenant ``id`` and the lower-cased ``name``.
    """
    schema = Schema(name=TEXT(stored=True), id=NUMERIC(stored=True))
    index = RamStorage().create_index(schema)
    writer = index.writer()
    for tenant in tenants:
        # Names are lower-cased before being indexed.
        lowered = tenant["name"].lower()
        writer.add_document(id=tenant["id"], name=lowered)
    writer.commit()
    return index
# Whoosh index over the known tenants; queried below for each test row.
tenant_ix = create_tenant_schema(known_tenants)
# import_schema = Schema(
# tenant=TEXT(stored=True),
# text=TEXT(stored=True),
# amount=NUMERIC(stored=True))
# import_ix = FileIndex(RamStorage(), import_schema)
# import_writer = import_ix.writer()
# for t in prev_imports:
# import_writer.add_document(
# tenant=t["tenant"],
# text=t["text"],
# amount=t["amount"])
# import_writer.commit()
# Splits a name on runs of non-word characters. The raw string keeps ``\W``
# a regex escape instead of an invalid string escape, and the pattern is
# compiled once rather than on every loop iteration.
_word_splitter = re.compile(r'\W+')


def get_score(r):
    """Return the score of the first match in ``r``, or 0 when there is none."""
    return r["matches"][0]["score"] if r["matches"] else 0


end_result = []
with tenant_ix.searcher() as searcher:
    for toto in to_test:
        # re.split yields empty strings when the name starts or ends with a
        # separator; those cannot match anything, so they are dropped.
        terms = [Term("name", w.lower())
                 for w in _word_splitter.split(toto['name']) if w]
        if terms:
            result = searcher.search(Or(terms))
            matches = [{
                # Tenant ids are positions in known_tenants (see mock_tenants).
                "tenant": known_tenants[r["id"]],
                "score": r.score
            } for r in result]
        else:
            # Nothing searchable in the name: record the row with no matches.
            matches = []
        end_result.append({"import": toto["name"], "matches": matches})

# Rows with the highest-scoring first match are printed first.
s = sorted(end_result, reverse=True, key=get_score)
print("\n".join(map(unicode, s)))