-
Notifications
You must be signed in to change notification settings - Fork 0
/
news.py
175 lines (147 loc) · 4.33 KB
/
news.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from datetime import datetime
import parser
import sys
import re
import os
import glob
import json
import arabic
from utils import exists, is_file, ls
#sys.setdefaultencoding('utf8')
######################Utils###########################
#--------------------------------------------------#
def stringfile(filePath):
    """Return the entire contents of the file at ``filePath`` as one string.

    The file is opened in text mode ('r'). It is closed before returning,
    including when reading raises, so no handle is leaked on error.
    """
    with open(filePath, 'r') as f:
        return f.read()
#--------------------------------------------------#
def fix_dirpath(dirpath):
    """Return ``dirpath`` ending in "/".

    A slash is appended only when ``dirpath`` is non-empty and does not
    already end with one; an empty path is returned unchanged.
    """
    needs_slash = len(dirpath) > 0 and dirpath[-1] != "/"
    return dirpath + "/" if needs_slash else dirpath
#--------------------------------------------------#
def purify_txt(txt):
    """Return ``txt`` with hyperlinks and punctuation replaced by spaces.

    Two substitutions are applied one after the other, each replacing every
    match with a single space:

    1. http(s) URLs (links inside a text are not of interest at this level
       of the project; pattern taken from stackoverflow.com);
    2. every remaining character that is neither a word character nor '#'.
    """
    patterns = [r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',r'[^\w#]']
    cleaned = txt
    for p in patterns:
        # Each pass must consume the previous pass's output; substituting on
        # the original text every time would discard the URL removal.
        cleaned = re.compile(p, re.UNICODE).sub(' ', cleaned)
    return cleaned
######################################################
###################i/o functions####################
def news_decoder(dic):
    """Build an Article from one decoded JSON object.

    NewsCorpus.populate passes this function as the decoder, and load_files
    hands it to ``json.loads`` as ``object_hook``, so it is called for every
    JSON object in a file.

    dic: mapping that must contain the keys "title", "content", "author",
         "url", "source" and "date_extracted".

    Returns the constructed Article.
    Raises KeyError when any of the required keys is missing.
    """
    return Article(
        dic["title"],
        dic["content"],
        dic["author"],
        dic["url"],
        dic["source"],
        dic["date_extracted"],
    )
#--------------------------------------------------#
def load_news(path,decoder):
    """Load every article reachable from ``path`` and number them.

    path:    a single file, or a directory whose "*.json" entries are
             collected through ``ls(..., pattern=True)`` (presumably a glob
             listing -- confirm against utils.ls).
    decoder: object hook forwarded to load_files.

    Returns one flat list of the loaded articles, each given a sequential id
    starting at 1 via ``set_id``. Returns None when no file is found (after
    printing "No files found") or when nothing could be loaded.
    """
    if is_file(path):
        candidates = [path]
    else:
        candidates = ls(fix_dirpath(path)+"*.json",pattern=True)

    if len(candidates) <= 0:
        print("No files found")
        return None

    per_file = load_files(candidates,decoder)
    if len(per_file) <= 0:
        return None

    # Each loaded file is expected to be an iterable of articles; flatten
    # them in file order, then number the result from 1.
    flattened = [article for batch in per_file for article in batch]
    for number, article in enumerate(flattened, 1):
        article.set_id(number)
    return flattened
#--------------------------------------------------#
def load_files(files,decoder):
    """Parse each existing regular file in ``files`` as JSON.

    files:   iterable of file paths; entries that do not exist or are not
             files are skipped silently.
    decoder: callable passed to ``json.loads`` as ``object_hook``.

    Returns a list with one parsed result per file that was read, in input
    order.
    """
    return [
        json.loads(stringfile(candidate), object_hook=decoder)
        for candidate in files
        if exists(candidate) and is_file(candidate)
    ]
######################################################
######################Classes#########################
class Article:
    """A single news article together with a cleaned copy of its text.

    Attributes set by the constructor:
      title, content, author, url, source -- stored as given.
      clean_content -- ``content`` passed through purify_txt and then
                       arabic.clean_text; "" when ``content`` is None.
      ext_date      -- datetime parsed from ``ext_date`` using the format
                       '%Y-%m-%d %H:%M:%S', or None when no date is given.
      id            -- None until set_id is called.
    """

    def __init__(self, title=None, content=None, author=None, url=None, source=None, ext_date=None):
        """Store the fields and derive ``clean_content`` and ``ext_date``.

        Raises ValueError if ``ext_date`` is given but does not match
        '%Y-%m-%d %H:%M:%S'.
        """
        self.id = None
        self.title = title
        self.content = content
        # content defaults to None (and JSON may carry null); cleaning None
        # would fail inside the regex substitution, so use an empty text.
        if content is None:
            self.clean_content = ""
        else:
            self.clean_content = arabic.clean_text(purify_txt(content))
        self.author = author
        self.url = url
        self.source = source
        # Always define the attribute so readers never hit AttributeError
        # on articles that were created without an extraction date.
        if ext_date:
            self.ext_date = datetime.strptime(ext_date, '%Y-%m-%d %H:%M:%S')
        else:
            self.ext_date = None

    def set_id(self, id):
        """Assign the article's identifier."""
        self.id = id

    def __str__(self):
        """Return the raw content ("" when the article has no content)."""
        return self.content if self.content is not None else ""

    def __len__(self):
        """Return the length of the raw content (0 when there is none)."""
        return len(self.content) if self.content is not None else 0
class NewsCorpus:
    """An in-memory collection of articles, optionally loaded from disk.

    Attributes:
      id, name -- stored as given.
      articles -- list of article objects (empty until populated).
      start    -- cursor used only by direct ``next(corpus)`` calls.
    """

    def __init__(self, id, name=None, fileName=None):
        """Create the corpus and, when ``fileName`` is given, populate it."""
        self.id = id
        self.name = name
        self.start = 0
        self.articles = []
        if fileName:
            self.populate(fileName)

    def populate(self, path):
        """Replace ``articles`` with everything load_news finds at ``path``.

        NOTE(review): when nothing is loaded this calls ``sys.exit()`` with
        no argument, i.e. it raises SystemExit with a success status. Kept
        as-is because callers may rely on it.
        """
        articles = load_news(path, news_decoder)
        if articles:
            self.articles = articles
        else:
            sys.exit()

    def __getitem__(self, index):
        """Return ``articles[index]`` (0-based position, not article id)."""
        return self.articles[index]

    def __len__(self):
        """Return the number of articles."""
        return len(self.articles)

    def __iter__(self):
        """Return a fresh, independent iterator over the articles.

        Handing out ``self`` would share one cursor between all loops, so a
        loop left with ``break`` would make the next loop resume mid-way and
        nested loops would interfere with each other.
        """
        return iter(self.articles)

    def __next__(self):
        """Advance the corpus-wide cursor; kept for direct ``next(corpus)``.

        Raises StopIteration (and rewinds the cursor) past the last article.
        """
        if self.start >= len(self.articles):
            self.start = 0
            raise StopIteration
        self.start += 1
        return self.articles[self.start - 1]

    def get_articles_by_author(self, author):
        """Return the articles whose author equals ``author`` as text.

        Both sides are converted to text before comparing. A falsy
        ``author`` yields an empty list.
        """
        if not author:
            return []
        try:
            to_text = unicode  # Python 2 text type
        except NameError:
            to_text = str  # Python 3 has no ``unicode`` builtin
        wanted = to_text(author)
        return [a for a in self.articles if to_text(a.author) == wanted]

    def get_article_by_id(self, id):
        """Return the article numbered ``id``, or None when out of range.

        load_news numbers articles sequentially from 1, so the article with
        id ``n`` is stored at position ``n - 1``.
        """
        if 0 < id <= len(self.articles):
            return self.articles[id - 1]
        return None

    def get_all_articles_contents(self, threshold=-1):
        """Return the raw content of every sufficiently long article.

        threshold: a content is included only when it has more than this
                   many characters; the default -1 includes everything.
        """
        return [a.content for a in self.articles if len(a.content) > threshold]

    def search(self, query):
        """Find articles whose cleaned content matches ``query``.

        query: regular expression (string or compiled pattern).

        Returns a list of ``(article id, number of matches)`` tuples for the
        articles with at least one match, in corpus order.
        """
        pattern = re.compile(query)  # compile once, not once per article
        hits = []
        for a in self.articles:
            count = sum(1 for _ in pattern.finditer(a.clean_content))
            if count > 0:
                hits.append((a.id, count))
        return hits
####################################################