-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse2.py
90 lines (72 loc) · 1.98 KB
/
parse2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import django
import csv
import sys
import os
import re
sys.path.append('.')
os.environ['DJANGO_SETTINGS_MODULE'] = 'papers_site.settings'
from base.models import Paper, Keyword, Journal
django.setup()
# Everything from the first '-', space, ',' or '.' onward is discarded from a
# raw date cell, e.g. "1987-88" -> "1987" and "1990, in press" -> "1990".
DATE_TAIL_RE = re.compile(r'[- ,.].*')

# Text wrapped in carets (^like this^) in a title is rendered as italics.
ITALIC_MARKUP_RE = re.compile(r'\^([^^]*)\^')


def clean_date(raw):
    """Return the leading integer-like token of *raw* as a string, or None.

    Surrounding '.' and then '?' characters are stripped, anything after the
    first '-', space, ',' or '.' is dropped, and the remainder is kept only
    if ``int()`` accepts it.
    """
    date = raw.strip('.').strip('?')
    date = DATE_TAIL_RE.sub('', date)
    try:
        int(date)
    except ValueError:
        return None
    return date


def format_title(raw):
    """Strip surrounding periods and turn ^caret^ spans into <i> tags."""
    return ITALIC_MARKUP_RE.sub(r'<i>\1</i>', raw.strip('.'))


def parse_reprint(raw):
    """Interpret the reprint cell as a boolean; non-numeric means False."""
    try:
        return bool(int(raw))
    except ValueError:
        return False


def parse_keywords(raw):
    """Split a comma-separated keyword cell into stripped, non-empty names.

    Empty entries are dropped so that a blank cell (or a stray trailing
    comma) does not produce a Keyword with an empty name.
    """
    stripped = (piece.strip() for piece in raw.split(','))
    return [piece for piece in stripped if piece]


def find_journal(raw_id):
    """Return the Journal whose ``orig_id`` equals *raw_id*, or None.

    When the id is not an integer or does not identify exactly one Journal,
    the raw value is printed so it can be reviewed by hand.
    """
    try:
        return Journal.objects.get(orig_id=int(raw_id))
    except (ValueError,
            Journal.DoesNotExist,
            Journal.MultipleObjectsReturned):
        print(raw_id)
        return None


def import_papers(path='papers.csv'):
    """Create a Paper (and its Keywords) for every data row of the CSV.

    Columns read, by index: 0 authors, 1 date, 2 title, 4 journal id,
    5 volume, 6 page start, 7 page end, 8 keywords, 10 reprint flag,
    11 lookup number for the pdf.
    """
    with open(path, 'r') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for i, row in enumerate(csvreader):
            # Progress indicator: the index of the row being processed.
            print(i)
            # Skip the header row; also skip blank lines, which the csv
            # module yields as empty lists.
            if i == 0 or not row:
                continue
            paper = Paper(
                author=row[0],
                date=clean_date(row[1]),
                title=format_title(row[2]),
                journal=find_journal(row[4]),
                volume=row[5],
                page_start=row[6],
                page_end=row[7],
                reprint=parse_reprint(row[10]),
                lookup=row[11],
            )
            # The paper must be saved before keywords can be attached to it.
            paper.save()
            for kword in parse_keywords(row[8]):
                obj, _created = Keyword.objects.get_or_create(keyword=kword)
                paper.keywords.add(obj)


if __name__ == '__main__':
    import_papers()